Skip to content

Commit 12ad308

Browse files
committed
Add integ test of high frequency gpu metrics
1 parent dcf0243 commit 12ad308

File tree

9 files changed

+549
-84
lines changed

9 files changed

+549
-84
lines changed

generator/test_case_generator.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,10 @@ var testTypeToTestConfig = map[string][]testConfig{
305305
testDir: "../../../../test/gpu",
306306
terraformDir: "terraform/eks/addon/gpu",
307307
},
308+
{
309+
testDir: "../../../../test/gpu_high_frequency_metrics",
310+
terraformDir: "terraform/eks/addon/gpu",
311+
},
308312
},
309313
"eks_daemon": {
310314
{
@@ -340,6 +344,10 @@ var testTypeToTestConfig = map[string][]testConfig{
340344
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
341345
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
342346
},
347+
{
348+
testDir: "./test/gpu_high_frequency_metrics", terraformDir: "terraform/eks/daemon/gpu",
349+
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
350+
},
343351
{
344352
testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron",
345353
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},

test/gpu/common/gpu_validation.go

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
//go:build !windows
5+
6+
package common
7+
8+
import (
9+
"encoding/json"
10+
"flag"
11+
"fmt"
12+
"log"
13+
"strings"
14+
"time"
15+
16+
"github.com/aws/amazon-cloudwatch-agent-test/environment"
17+
"github.com/aws/amazon-cloudwatch-agent-test/test/metric"
18+
"github.com/aws/amazon-cloudwatch-agent-test/test/status"
19+
"github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
20+
"github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs/types"
21+
)
22+
23+
// Names of the GPU metrics referenced by the validation code in this package.
const (
24+
// GPUMetricIndicator is the substring handed to metric.ValidateMetrics as
// the metric indicator by ValidateGPUMetrics.
GPUMetricIndicator = "_gpu_"
25+
26+
// Container-level GPU metric names.
ContainerMemTotal = "container_gpu_memory_total"
27+
ContainerMemUsed = "container_gpu_memory_used"
28+
ContainerPower = "container_gpu_power_draw"
29+
ContainerTemp = "container_gpu_temperature"
30+
ContainerUtil = "container_gpu_utilization"
31+
ContainerMemUtil = "container_gpu_memory_utilization"
32+
// Pod-level GPU metric names.
PodMemTotal = "pod_gpu_memory_total"
33+
PodMemUsed = "pod_gpu_memory_used"
34+
PodPower = "pod_gpu_power_draw"
35+
PodTemp = "pod_gpu_temperature"
36+
PodUtil = "pod_gpu_utilization"
37+
PodMemUtil = "pod_gpu_memory_utilization"
38+
PodLimit = "pod_gpu_limit"
39+
PodRequest = "pod_gpu_request"
40+
PodCountTotal = "pod_gpu_usage_total"
41+
PodReserved = "pod_gpu_reserved_capacity"
42+
// Node-level GPU metric names.
NodeMemTotal = "node_gpu_memory_total"
43+
NodeMemUsed = "node_gpu_memory_used"
44+
NodePower = "node_gpu_power_draw"
45+
NodeTemp = "node_gpu_temperature"
46+
NodeUtil = "node_gpu_utilization"
47+
NodeMemUtil = "node_gpu_memory_utilization"
48+
NodeCountTotal = "node_gpu_usage_total"
49+
NodeCountLimit = "node_gpu_limit"
50+
NodeReserved = "node_gpu_reserved_capacity"
51+
)
52+
53+
// UseE2EMetrics is the boolean -useE2EMetrics command-line flag (default
// false). When it is set, ValidateGPUMetrics additionally expects the GPU
// limit, request, usage-total and reserved-capacity metrics.
var UseE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")
54+
55+
// ExpectedDimsToMetricsIntegTest is the baseline expectation table for GPU
// validation. Each key names a dimension set (dimension names joined by "-")
// and each value lists the metric names expected under that dimension set.
//
// ValidateGPUMetrics copies this table before appending the E2E-only metrics,
// so the package-level value is not modified by that function.
56+
var ExpectedDimsToMetricsIntegTest = map[string][]string{
57+
"ClusterName": {
58+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
59+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
60+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil,
61+
"container_gpu_tensor_core_utilization", "pod_gpu_tensor_core_utilization", "node_gpu_tensor_core_utilization",
62+
"node_gpu_unreserved_capacity", "node_gpu_available_capacity",
63+
},
64+
"ClusterName-Namespace": {
65+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
66+
"pod_gpu_tensor_core_utilization",
67+
},
68+
//"ClusterName-Namespace-Service": {
69+
// PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
70+
//},
71+
"ClusterName-Namespace-PodName": {
72+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
73+
"pod_gpu_tensor_core_utilization",
74+
},
75+
"ClusterName-ContainerName-Namespace-PodName": {
76+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
77+
"container_gpu_tensor_core_utilization",
78+
},
79+
"ClusterName-ContainerName-FullPodName-Namespace-PodName": {
80+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
81+
"container_gpu_tensor_core_utilization",
82+
},
83+
"ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
84+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
85+
"container_gpu_tensor_core_utilization",
86+
},
87+
"ClusterName-FullPodName-Namespace-PodName": {
88+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
89+
"pod_gpu_tensor_core_utilization",
90+
},
91+
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
92+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
93+
"pod_gpu_tensor_core_utilization",
94+
},
95+
"ClusterName-InstanceId-NodeName": {
96+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil,
97+
"node_gpu_tensor_core_utilization", "node_gpu_unreserved_capacity", "node_gpu_available_capacity",
98+
//NodeCountTotal, NodeCountRequest, NodeCountLimit,
99+
},
100+
"ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
101+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil,
102+
"node_gpu_tensor_core_utilization",
103+
},
104+
}
105+
106+
// ValidateGPUMetrics validates GPU metrics using the common validation logic
107+
func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
108+
var testResults []status.TestResult
109+
110+
// Create a copy of the expected dimensions to metrics map
111+
expectedDimsToMetrics := make(map[string][]string)
112+
for k, v := range ExpectedDimsToMetricsIntegTest {
113+
expectedDimsToMetrics[k] = append([]string{}, v...)
114+
}
115+
116+
// Add GPU count metrics if using E2E metrics
117+
if *UseE2EMetrics {
118+
expectedDimsToMetrics["ClusterName"] = append(
119+
expectedDimsToMetrics["ClusterName"],
120+
PodReserved, PodRequest, PodCountTotal, PodLimit, NodeCountTotal, NodeCountLimit, NodeReserved,
121+
)
122+
expectedDimsToMetrics["ClusterName-Namespace-PodName"] = append(
123+
expectedDimsToMetrics["ClusterName-Namespace-PodName"],
124+
PodCountTotal, PodRequest, PodReserved, PodLimit,
125+
)
126+
expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"] = append(
127+
expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"],
128+
PodCountTotal, PodRequest, PodReserved, PodLimit,
129+
)
130+
expectedDimsToMetrics["ClusterName-InstanceId-NodeName"] = append(
131+
expectedDimsToMetrics["ClusterName-InstanceId-NodeName"],
132+
NodeCountLimit, NodeCountTotal, NodeReserved,
133+
)
134+
}
135+
136+
// Validate metrics and logs
137+
testResults = append(testResults, metric.ValidateMetrics(env, GPUMetricIndicator, expectedDimsToMetrics)...)
138+
testResults = append(testResults, metric.ValidateLogs(env))
139+
140+
return testResults
141+
}
142+
143+
// ValidateHistogramFormat validates that the logs contain metrics in histogram format
144+
func ValidateHistogramFormat(env *environment.MetaData) status.TestResult {
145+
testResult := status.TestResult{
146+
Name: "histogram-format",
147+
Status: status.FAILED,
148+
}
149+
150+
end := time.Now()
151+
start := end.Add(time.Duration(-3) * time.Minute)
152+
group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)
153+
154+
log.Println("Searching for histogram format in log group:", group)
155+
156+
// Get the instances used for the EKS cluster
157+
eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName)
158+
if err != nil {
159+
log.Println("Failed to get EKS instances:", err)
160+
return testResult
161+
}
162+
163+
histogramFound := false
164+
logCount := 0
165+
gpuMetricCount := 0
166+
167+
for _, instance := range eKSInstances {
168+
stream := *instance.InstanceName
169+
170+
err = awsservice.ValidateLogs(
171+
group,
172+
stream,
173+
&start,
174+
&end,
175+
awsservice.AssertLogsNotEmpty(),
176+
awsservice.AssertPerLog(
177+
func(event types.OutputLogEvent) error {
178+
logCount++
179+
message := *event.Message
180+
181+
// Check if the log contains histogram format
182+
var logData map[string]interface{}
183+
if err := json.Unmarshal([]byte(message), &logData); err != nil {
184+
return nil // Skip this log if it's not valid JSON
185+
}
186+
187+
// Check for GPU metrics with histogram format
188+
gpuMetricsInLog := 0
189+
for key, value := range logData {
190+
if !strings.Contains(key, "_gpu_") {
191+
continue
192+
}
193+
194+
gpuMetricsInLog++
195+
gpuMetricCount++
196+
197+
// Check if the value is a map with histogram fields
198+
valueMap, ok := value.(map[string]interface{})
199+
if !ok {
200+
continue
201+
}
202+
203+
// Check for required histogram fields
204+
_, hasValues := valueMap["Values"]
205+
_, hasCounts := valueMap["Counts"]
206+
_, hasMax := valueMap["Max"]
207+
_, hasMin := valueMap["Min"]
208+
_, hasCount := valueMap["Count"]
209+
_, hasSum := valueMap["Sum"]
210+
211+
if hasValues && hasCounts && hasMax && hasMin && hasCount && hasSum {
212+
histogramFound = true
213+
log.Println("Found GPU metric in histogram format:", key)
214+
return nil
215+
}
216+
}
217+
218+
return nil // Continue checking other logs
219+
},
220+
),
221+
)
222+
223+
if err != nil {
224+
log.Println("Error validating logs:", err)
225+
}
226+
227+
if histogramFound {
228+
log.Println("Successfully found GPU metric in histogram format")
229+
testResult.Status = status.SUCCESSFUL
230+
return testResult
231+
}
232+
}
233+
234+
log.Printf("Processed %d logs, found %d GPU metrics, but none in histogram format", logCount, gpuMetricCount)
235+
return testResult
236+
}

test/gpu/nvidia_test.go

Lines changed: 4 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -6,86 +6,14 @@
66
package emf
77

88
import (
9-
"flag"
109
"time"
1110

1211
"github.com/aws/amazon-cloudwatch-agent-test/environment"
13-
"github.com/aws/amazon-cloudwatch-agent-test/test/metric"
12+
"github.com/aws/amazon-cloudwatch-agent-test/test/gpu/common"
1413
"github.com/aws/amazon-cloudwatch-agent-test/test/status"
1514
"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
1615
)
1716

18-
const (
19-
gpuMetricIndicator = "_gpu_"
20-
21-
containerMemTotal = "container_gpu_memory_total"
22-
containerMemUsed = "container_gpu_memory_used"
23-
containerPower = "container_gpu_power_draw"
24-
containerTemp = "container_gpu_temperature"
25-
containerUtil = "container_gpu_utilization"
26-
containerMemUtil = "container_gpu_memory_utilization"
27-
podMemTotal = "pod_gpu_memory_total"
28-
podMemUsed = "pod_gpu_memory_used"
29-
podPower = "pod_gpu_power_draw"
30-
podTemp = "pod_gpu_temperature"
31-
podUtil = "pod_gpu_utilization"
32-
podMemUtil = "pod_gpu_memory_utilization"
33-
podLimit = "pod_gpu_limit"
34-
podRequest = "pod_gpu_request"
35-
podCountTotal = "pod_gpu_usage_total"
36-
podReserved = "pod_gpu_reserved_capacity"
37-
nodeMemTotal = "node_gpu_memory_total"
38-
nodeMemUsed = "node_gpu_memory_used"
39-
nodePower = "node_gpu_power_draw"
40-
nodeTemp = "node_gpu_temperature"
41-
nodeUtil = "node_gpu_utilization"
42-
nodeMemUtil = "node_gpu_memory_utilization"
43-
nodeCountTotal = "node_gpu_usage_total"
44-
nodeCountLimit = "node_gpu_limit"
45-
nodeReserved = "node_gpu_reserved_capacity"
46-
)
47-
48-
var useE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")
49-
50-
var expectedDimsToMetricsIntegTest = map[string][]string{
51-
"ClusterName": {
52-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
53-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
54-
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
55-
},
56-
"ClusterName-Namespace": {
57-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
58-
},
59-
//"ClusterName-Namespace-Service": {
60-
// podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
61-
//},
62-
"ClusterName-Namespace-PodName": {
63-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
64-
},
65-
"ClusterName-ContainerName-Namespace-PodName": {
66-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
67-
},
68-
"ClusterName-ContainerName-FullPodName-Namespace-PodName": {
69-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
70-
},
71-
"ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
72-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
73-
},
74-
"ClusterName-FullPodName-Namespace-PodName": {
75-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
76-
},
77-
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
78-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
79-
},
80-
"ClusterName-InstanceId-NodeName": {
81-
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
82-
//nodeCountTotal, nodeCountRequest, nodeCountLimit,
83-
},
84-
"ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
85-
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
86-
},
87-
}
88-
8917
type NvidiaTestRunner struct {
9018
test_runner.BaseTestRunner
9119
testName string
@@ -95,17 +23,9 @@ type NvidiaTestRunner struct {
9523
var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil)
9624

9725
func (t *NvidiaTestRunner) Validate() status.TestGroupResult {
98-
var testResults []status.TestResult
99-
expectedDimsToMetrics := expectedDimsToMetricsIntegTest
100-
if *useE2EMetrics {
101-
// add GPU count metrics
102-
expectedDimsToMetricsIntegTest["ClusterName"] = append(expectedDimsToMetricsIntegTest["ClusterName"], podReserved, podRequest, podCountTotal, podLimit, nodeCountTotal, nodeCountLimit, nodeReserved)
103-
expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
104-
expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
105-
expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"] = append(expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"], nodeCountLimit, nodeCountTotal, nodeReserved)
106-
}
107-
testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...)
108-
testResults = append(testResults, metric.ValidateLogs(t.env))
26+
// Use the common GPU validation logic
27+
testResults := common.ValidateGPUMetrics(t.env)
28+
10929
return status.TestGroupResult{
11030
Name: t.GetTestName(),
11131
TestResults: testResults,

0 commit comments

Comments
 (0)