Skip to content

Commit 63ca48e

Browse files
committed
Add integ test of high frequency gpu metrics
1 parent 7cd8c97 commit 63ca48e

File tree

13 files changed

+545
-85
lines changed

13 files changed

+545
-85
lines changed

generator/test_case_generator.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,10 @@ var testTypeToTestConfig = map[string][]testConfig{
352352
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
353353
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
354354
},
355+
{
356+
testDir: "./test/gpu_high_frequency_metrics", terraformDir: "terraform/eks/daemon/gpu",
357+
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
358+
},
355359
{
356360
testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron",
357361
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},

terraform/eks/daemon/gpu/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ resource "kubernetes_daemonset" "exporter" {
306306
"-c",
307307
]
308308
args = [
309-
"/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
309+
"/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_PROF_PIPE_TENSOR_ACTIVE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
310310
]
311311
volume_mount {
312312
mount_path = "/etc/amazon-cloudwatch-observability-dcgm-cert"

test/gpu/common/gpu_validation.go

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
//go:build !windows
5+
6+
package common
7+
8+
import (
9+
"encoding/json"
10+
"flag"
11+
"fmt"
12+
"log"
13+
"strings"
14+
"time"
15+
16+
"github.com/aws/amazon-cloudwatch-agent-test/environment"
17+
"github.com/aws/amazon-cloudwatch-agent-test/test/metric"
18+
"github.com/aws/amazon-cloudwatch-agent-test/test/status"
19+
"github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
20+
"github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs/types"
21+
)
22+
23+
// GPUMetricIndicator is the substring that appears in every GPU metric name
// declared in this file.
const GPUMetricIndicator = "_gpu_"

// Container-scoped GPU metric names.
const (
	ContainerMemTotal   = "container_gpu_memory_total"
	ContainerMemUsed    = "container_gpu_memory_used"
	ContainerPower      = "container_gpu_power_draw"
	ContainerTemp       = "container_gpu_temperature"
	ContainerUtil       = "container_gpu_utilization"
	ContainerMemUtil    = "container_gpu_memory_utilization"
	ContainerTensorUtil = "container_gpu_tensor_core_utilization"
)

// Pod-scoped GPU metric names.
const (
	PodMemTotal   = "pod_gpu_memory_total"
	PodMemUsed    = "pod_gpu_memory_used"
	PodPower      = "pod_gpu_power_draw"
	PodTemp       = "pod_gpu_temperature"
	PodUtil       = "pod_gpu_utilization"
	PodMemUtil    = "pod_gpu_memory_utilization"
	PodTensorUtil = "pod_gpu_tensor_core_utilization"
	PodLimit      = "pod_gpu_limit"
	PodRequest    = "pod_gpu_request"
	PodCountTotal = "pod_gpu_usage_total"
	PodReserved   = "pod_gpu_reserved_capacity"
)

// Node-scoped GPU metric names.
const (
	NodeMemTotal   = "node_gpu_memory_total"
	NodeMemUsed    = "node_gpu_memory_used"
	NodePower      = "node_gpu_power_draw"
	NodeTemp       = "node_gpu_temperature"
	NodeUtil       = "node_gpu_utilization"
	NodeMemUtil    = "node_gpu_memory_utilization"
	NodeTensorUtil = "node_gpu_tensor_core_utilization"
	NodeCountTotal = "node_gpu_usage_total"
	NodeCountLimit = "node_gpu_limit"
	NodeReserved   = "node_gpu_reserved_capacity"
	NodeUnreserved = "node_gpu_unreserved_capacity"
	NodeAvailable  = "node_gpu_available_capacity"
)
57+
58+
// UseE2EMetrics is the "useE2EMetrics" command-line flag (default false).
// ValidateGPUMetrics reads it to decide whether the GPU count and capacity
// metrics are added to the expected set.
var UseE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")
59+
60+
// ExpectedDimsToMetricsIntegTest defines the expected dimensions and metrics for GPU validation
61+
var ExpectedDimsToMetricsIntegTest = map[string][]string{
62+
"ClusterName": {
63+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
64+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
65+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
66+
},
67+
"ClusterName-Namespace": {
68+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
69+
},
70+
//"ClusterName-Namespace-Service": {
71+
// PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
72+
//},
73+
"ClusterName-Namespace-PodName": {
74+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
75+
},
76+
"ClusterName-ContainerName-Namespace-PodName": {
77+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
78+
},
79+
"ClusterName-ContainerName-FullPodName-Namespace-PodName": {
80+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
81+
},
82+
"ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
83+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
84+
},
85+
"ClusterName-FullPodName-Namespace-PodName": {
86+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
87+
},
88+
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
89+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
90+
},
91+
"ClusterName-InstanceId-NodeName": {
92+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
93+
//NodeCountTotal, NodeCountRequest, NodeCountLimit,
94+
},
95+
"ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
96+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
97+
},
98+
}
99+
100+
// ValidateGPUMetrics validates GPU metrics using the common validation logic
101+
func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
102+
var testResults []status.TestResult
103+
104+
// Create a copy of the expected dimensions to metrics map
105+
expectedDimsToMetrics := make(map[string][]string)
106+
for k, v := range ExpectedDimsToMetricsIntegTest {
107+
expectedDimsToMetrics[k] = append([]string{}, v...)
108+
}
109+
110+
// Add GPU count metrics if using E2E metrics
111+
if *UseE2EMetrics {
112+
expectedDimsToMetrics["ClusterName"] = append(
113+
expectedDimsToMetrics["ClusterName"],
114+
PodReserved, PodRequest, PodCountTotal, PodLimit, NodeCountTotal, NodeCountLimit, NodeReserved, NodeUnreserved, NodeAvailable,
115+
)
116+
expectedDimsToMetrics["ClusterName-Namespace-PodName"] = append(
117+
expectedDimsToMetrics["ClusterName-Namespace-PodName"],
118+
PodCountTotal, PodRequest, PodReserved, PodLimit,
119+
)
120+
expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"] = append(
121+
expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"],
122+
PodCountTotal, PodRequest, PodReserved, PodLimit,
123+
)
124+
expectedDimsToMetrics["ClusterName-InstanceId-NodeName"] = append(
125+
expectedDimsToMetrics["ClusterName-InstanceId-NodeName"],
126+
NodeCountLimit, NodeCountTotal, NodeReserved, NodeUnreserved, NodeAvailable,
127+
)
128+
}
129+
130+
// Validate metrics and logs
131+
testResults = append(testResults, metric.ValidateMetrics(env, GPUMetricIndicator, expectedDimsToMetrics)...)
132+
testResults = append(testResults, metric.ValidateLogs(env))
133+
134+
return testResults
135+
}
136+
137+
// ValidateHistogramFormat validates that the logs contain metrics in histogram format
138+
func ValidateHistogramFormat(env *environment.MetaData) status.TestResult {
139+
testResult := status.TestResult{
140+
Name: "histogram-format",
141+
Status: status.FAILED,
142+
}
143+
144+
end := time.Now()
145+
start := end.Add(time.Duration(-3) * time.Minute)
146+
group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)
147+
148+
log.Println("Searching for histogram format in log group:", group)
149+
150+
// Get the instances used for the EKS cluster
151+
eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName)
152+
if err != nil {
153+
log.Println("Failed to get EKS instances:", err)
154+
return testResult
155+
}
156+
157+
histogramFound := false
158+
logCount := 0
159+
gpuMetricCount := 0
160+
161+
for _, instance := range eKSInstances {
162+
stream := *instance.InstanceName
163+
164+
err = awsservice.ValidateLogs(
165+
group,
166+
stream,
167+
&start,
168+
&end,
169+
awsservice.AssertLogsNotEmpty(),
170+
awsservice.AssertPerLog(
171+
func(event types.OutputLogEvent) error {
172+
logCount++
173+
message := *event.Message
174+
175+
// Check if the log contains histogram format
176+
var logData map[string]interface{}
177+
if err := json.Unmarshal([]byte(message), &logData); err != nil {
178+
return nil // Skip this log if it's not valid JSON
179+
}
180+
181+
// Check for GPU metrics with histogram format
182+
gpuMetricsInLog := 0
183+
for key, value := range logData {
184+
if !strings.Contains(key, "_gpu_") {
185+
continue
186+
}
187+
188+
gpuMetricsInLog++
189+
gpuMetricCount++
190+
191+
// Check if the value is a map with histogram fields
192+
valueMap, ok := value.(map[string]interface{})
193+
if !ok {
194+
continue
195+
}
196+
197+
// Check for required histogram fields
198+
_, hasValues := valueMap["Values"]
199+
_, hasCounts := valueMap["Counts"]
200+
_, hasMax := valueMap["Max"]
201+
_, hasMin := valueMap["Min"]
202+
_, hasCount := valueMap["Count"]
203+
_, hasSum := valueMap["Sum"]
204+
205+
if hasValues && hasCounts && hasMax && hasMin && hasCount && hasSum {
206+
histogramFound = true
207+
log.Println("Found GPU metric in histogram format:", key)
208+
return nil
209+
}
210+
}
211+
212+
return nil // Continue checking other logs
213+
},
214+
),
215+
)
216+
217+
if err != nil {
218+
log.Println("Error validating logs:", err)
219+
}
220+
221+
if histogramFound {
222+
log.Println("Successfully found GPU metric in histogram format")
223+
testResult.Status = status.SUCCESSFUL
224+
return testResult
225+
}
226+
}
227+
228+
log.Printf("Processed %d logs, found %d GPU metrics, but none in histogram format", logCount, gpuMetricCount)
229+
return testResult
230+
}

test/gpu/nvidia_test.go

Lines changed: 4 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -6,86 +6,14 @@
66
package emf
77

88
import (
9-
"flag"
109
"time"
1110

1211
"github.com/aws/amazon-cloudwatch-agent-test/environment"
13-
"github.com/aws/amazon-cloudwatch-agent-test/test/metric"
12+
"github.com/aws/amazon-cloudwatch-agent-test/test/gpu/common"
1413
"github.com/aws/amazon-cloudwatch-agent-test/test/status"
1514
"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
1615
)
1716

18-
const (
19-
gpuMetricIndicator = "_gpu_"
20-
21-
containerMemTotal = "container_gpu_memory_total"
22-
containerMemUsed = "container_gpu_memory_used"
23-
containerPower = "container_gpu_power_draw"
24-
containerTemp = "container_gpu_temperature"
25-
containerUtil = "container_gpu_utilization"
26-
containerMemUtil = "container_gpu_memory_utilization"
27-
podMemTotal = "pod_gpu_memory_total"
28-
podMemUsed = "pod_gpu_memory_used"
29-
podPower = "pod_gpu_power_draw"
30-
podTemp = "pod_gpu_temperature"
31-
podUtil = "pod_gpu_utilization"
32-
podMemUtil = "pod_gpu_memory_utilization"
33-
podLimit = "pod_gpu_limit"
34-
podRequest = "pod_gpu_request"
35-
podCountTotal = "pod_gpu_usage_total"
36-
podReserved = "pod_gpu_reserved_capacity"
37-
nodeMemTotal = "node_gpu_memory_total"
38-
nodeMemUsed = "node_gpu_memory_used"
39-
nodePower = "node_gpu_power_draw"
40-
nodeTemp = "node_gpu_temperature"
41-
nodeUtil = "node_gpu_utilization"
42-
nodeMemUtil = "node_gpu_memory_utilization"
43-
nodeCountTotal = "node_gpu_usage_total"
44-
nodeCountLimit = "node_gpu_limit"
45-
nodeReserved = "node_gpu_reserved_capacity"
46-
)
47-
48-
var useE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")
49-
50-
var expectedDimsToMetricsIntegTest = map[string][]string{
51-
"ClusterName": {
52-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
53-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
54-
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
55-
},
56-
"ClusterName-Namespace": {
57-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
58-
},
59-
//"ClusterName-Namespace-Service": {
60-
// podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
61-
//},
62-
"ClusterName-Namespace-PodName": {
63-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
64-
},
65-
"ClusterName-ContainerName-Namespace-PodName": {
66-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
67-
},
68-
"ClusterName-ContainerName-FullPodName-Namespace-PodName": {
69-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
70-
},
71-
"ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
72-
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
73-
},
74-
"ClusterName-FullPodName-Namespace-PodName": {
75-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
76-
},
77-
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
78-
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
79-
},
80-
"ClusterName-InstanceId-NodeName": {
81-
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
82-
//nodeCountTotal, nodeCountRequest, nodeCountLimit,
83-
},
84-
"ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
85-
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
86-
},
87-
}
88-
8917
type NvidiaTestRunner struct {
9018
test_runner.BaseTestRunner
9119
testName string
@@ -95,17 +23,9 @@ type NvidiaTestRunner struct {
9523
var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil)
9624

9725
func (t *NvidiaTestRunner) Validate() status.TestGroupResult {
98-
var testResults []status.TestResult
99-
expectedDimsToMetrics := expectedDimsToMetricsIntegTest
100-
if *useE2EMetrics {
101-
// add GPU count metrics
102-
expectedDimsToMetricsIntegTest["ClusterName"] = append(expectedDimsToMetricsIntegTest["ClusterName"], podReserved, podRequest, podCountTotal, podLimit, nodeCountTotal, nodeCountLimit, nodeReserved)
103-
expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
104-
expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
105-
expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"] = append(expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"], nodeCountLimit, nodeCountTotal, nodeReserved)
106-
}
107-
testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...)
108-
testResults = append(testResults, metric.ValidateLogs(t.env))
26+
// Use the common GPU validation logic
27+
testResults := common.ValidateGPUMetrics(t.env)
28+
10929
return status.TestGroupResult{
11030
Name: t.GetTestName(),
11131
TestResults: testResults,

0 commit comments

Comments
 (0)