|
| 1 | +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +// SPDX-License-Identifier: MIT |
| 3 | + |
| 4 | +//go:build !windows |
| 5 | + |
| 6 | +package common |
| 7 | + |
| 8 | +import ( |
| 9 | + "encoding/json" |
| 10 | + "flag" |
| 11 | + "fmt" |
| 12 | + "log" |
| 13 | + "strings" |
| 14 | + "time" |
| 15 | + |
| 16 | + "github.com/aws/amazon-cloudwatch-agent-test/environment" |
| 17 | + "github.com/aws/amazon-cloudwatch-agent-test/test/metric" |
| 18 | + "github.com/aws/amazon-cloudwatch-agent-test/test/status" |
| 19 | + "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" |
| 20 | + "github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs/types" |
| 21 | +) |
| 22 | + |
// GPUMetricIndicator is the substring common to every GPU metric name below;
// it is used to pick GPU metrics out of a larger set of metric names.
const GPUMetricIndicator = "_gpu_"

// Container-level GPU metric names.
const (
	ContainerMemTotal   = "container_gpu_memory_total"
	ContainerMemUsed    = "container_gpu_memory_used"
	ContainerPower      = "container_gpu_power_draw"
	ContainerTemp       = "container_gpu_temperature"
	ContainerUtil       = "container_gpu_utilization"
	ContainerMemUtil    = "container_gpu_memory_utilization"
	ContainerTensorUtil = "container_gpu_tensor_core_utilization"
)

// Pod-level GPU metric names.
const (
	PodMemTotal   = "pod_gpu_memory_total"
	PodMemUsed    = "pod_gpu_memory_used"
	PodPower      = "pod_gpu_power_draw"
	PodTemp       = "pod_gpu_temperature"
	PodUtil       = "pod_gpu_utilization"
	PodMemUtil    = "pod_gpu_memory_utilization"
	PodTensorUtil = "pod_gpu_tensor_core_utilization"

	// Pod-level GPU count and capacity metric names.
	PodLimit      = "pod_gpu_limit"
	PodRequest    = "pod_gpu_request"
	PodCountTotal = "pod_gpu_usage_total"
	PodReserved   = "pod_gpu_reserved_capacity"
)

// Node-level GPU metric names.
const (
	NodeMemTotal   = "node_gpu_memory_total"
	NodeMemUsed    = "node_gpu_memory_used"
	NodePower      = "node_gpu_power_draw"
	NodeTemp       = "node_gpu_temperature"
	NodeUtil       = "node_gpu_utilization"
	NodeMemUtil    = "node_gpu_memory_utilization"
	NodeTensorUtil = "node_gpu_tensor_core_utilization"

	// Node-level GPU count and capacity metric names.
	NodeCountTotal = "node_gpu_usage_total"
	NodeCountLimit = "node_gpu_limit"
	NodeReserved   = "node_gpu_reserved_capacity"
	NodeUnreserved = "node_gpu_unreserved_capacity"
	NodeAvailable  = "node_gpu_available_capacity"
)
| 57 | + |
// UseE2EMetrics is the "useE2EMetrics" command-line flag (default false).
// When set, ValidateGPUMetrics also expects the GPU count and capacity
// metrics in addition to the base metric set.
var UseE2EMetrics = flag.Bool(
	"useE2EMetrics",
	false,
	"Use E2E metrics mapping which uses latest build CWA",
)
| 59 | + |
| 60 | +// ExpectedDimsToMetricsIntegTest defines the expected dimensions and metrics for GPU validation |
| 61 | +var ExpectedDimsToMetricsIntegTest = map[string][]string{ |
| 62 | + "ClusterName": { |
| 63 | + ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil, |
| 64 | + PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil, |
| 65 | + NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil, |
| 66 | + }, |
| 67 | + "ClusterName-Namespace": { |
| 68 | + PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil, |
| 69 | + }, |
| 70 | + //"ClusterName-Namespace-Service": { |
| 71 | + // PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, |
| 72 | + //}, |
| 73 | + "ClusterName-Namespace-PodName": { |
| 74 | + PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil, |
| 75 | + }, |
| 76 | + "ClusterName-ContainerName-Namespace-PodName": { |
| 77 | + ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil, |
| 78 | + }, |
| 79 | + "ClusterName-ContainerName-FullPodName-Namespace-PodName": { |
| 80 | + ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil, |
| 81 | + }, |
| 82 | + "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": { |
| 83 | + ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil, |
| 84 | + }, |
| 85 | + "ClusterName-FullPodName-Namespace-PodName": { |
| 86 | + PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil, |
| 87 | + }, |
| 88 | + "ClusterName-FullPodName-GpuDevice-Namespace-PodName": { |
| 89 | + PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil, |
| 90 | + }, |
| 91 | + "ClusterName-InstanceId-NodeName": { |
| 92 | + NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil, |
| 93 | + //NodeCountTotal, NodeCountRequest, NodeCountLimit, |
| 94 | + }, |
| 95 | + "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": { |
| 96 | + NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil, |
| 97 | + }, |
| 98 | +} |
| 99 | + |
| 100 | +// ValidateGPUMetrics validates GPU metrics using the common validation logic |
| 101 | +func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult { |
| 102 | + var testResults []status.TestResult |
| 103 | + |
| 104 | + // Create a copy of the expected dimensions to metrics map |
| 105 | + expectedDimsToMetrics := make(map[string][]string) |
| 106 | + for k, v := range ExpectedDimsToMetricsIntegTest { |
| 107 | + expectedDimsToMetrics[k] = append([]string{}, v...) |
| 108 | + } |
| 109 | + |
| 110 | + // Add GPU count metrics if using E2E metrics |
| 111 | + if *UseE2EMetrics { |
| 112 | + expectedDimsToMetrics["ClusterName"] = append( |
| 113 | + expectedDimsToMetrics["ClusterName"], |
| 114 | + PodReserved, PodRequest, PodCountTotal, PodLimit, NodeCountTotal, NodeCountLimit, NodeReserved, NodeUnreserved, NodeAvailable, |
| 115 | + ) |
| 116 | + expectedDimsToMetrics["ClusterName-Namespace-PodName"] = append( |
| 117 | + expectedDimsToMetrics["ClusterName-Namespace-PodName"], |
| 118 | + PodCountTotal, PodRequest, PodReserved, PodLimit, |
| 119 | + ) |
| 120 | + expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"] = append( |
| 121 | + expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"], |
| 122 | + PodCountTotal, PodRequest, PodReserved, PodLimit, |
| 123 | + ) |
| 124 | + expectedDimsToMetrics["ClusterName-InstanceId-NodeName"] = append( |
| 125 | + expectedDimsToMetrics["ClusterName-InstanceId-NodeName"], |
| 126 | + NodeCountLimit, NodeCountTotal, NodeReserved, NodeUnreserved, NodeAvailable, |
| 127 | + ) |
| 128 | + } |
| 129 | + |
| 130 | + // Validate metrics and logs |
| 131 | + testResults = append(testResults, metric.ValidateMetrics(env, GPUMetricIndicator, expectedDimsToMetrics)...) |
| 132 | + testResults = append(testResults, metric.ValidateLogs(env)) |
| 133 | + |
| 134 | + return testResults |
| 135 | +} |
| 136 | + |
| 137 | +// ValidateHistogramFormat validates that the logs contain metrics in histogram format |
| 138 | +func ValidateHistogramFormat(env *environment.MetaData) status.TestResult { |
| 139 | + testResult := status.TestResult{ |
| 140 | + Name: "histogram-format", |
| 141 | + Status: status.FAILED, |
| 142 | + } |
| 143 | + |
| 144 | + end := time.Now() |
| 145 | + start := end.Add(time.Duration(-3) * time.Minute) |
| 146 | + group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName) |
| 147 | + |
| 148 | + log.Println("Searching for histogram format in log group:", group) |
| 149 | + |
| 150 | + // Get the instances used for the EKS cluster |
| 151 | + eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) |
| 152 | + if err != nil { |
| 153 | + log.Println("Failed to get EKS instances:", err) |
| 154 | + return testResult |
| 155 | + } |
| 156 | + |
| 157 | + histogramFound := false |
| 158 | + logCount := 0 |
| 159 | + gpuMetricCount := 0 |
| 160 | + |
| 161 | + for _, instance := range eKSInstances { |
| 162 | + stream := *instance.InstanceName |
| 163 | + |
| 164 | + err = awsservice.ValidateLogs( |
| 165 | + group, |
| 166 | + stream, |
| 167 | + &start, |
| 168 | + &end, |
| 169 | + awsservice.AssertLogsNotEmpty(), |
| 170 | + awsservice.AssertPerLog( |
| 171 | + func(event types.OutputLogEvent) error { |
| 172 | + logCount++ |
| 173 | + message := *event.Message |
| 174 | + |
| 175 | + // Check if the log contains histogram format |
| 176 | + var logData map[string]interface{} |
| 177 | + if err := json.Unmarshal([]byte(message), &logData); err != nil { |
| 178 | + return nil // Skip this log if it's not valid JSON |
| 179 | + } |
| 180 | + |
| 181 | + // Check for GPU metrics with histogram format |
| 182 | + gpuMetricsInLog := 0 |
| 183 | + for key, value := range logData { |
| 184 | + if !strings.Contains(key, "_gpu_") { |
| 185 | + continue |
| 186 | + } |
| 187 | + |
| 188 | + gpuMetricsInLog++ |
| 189 | + gpuMetricCount++ |
| 190 | + |
| 191 | + // Check if the value is a map with histogram fields |
| 192 | + valueMap, ok := value.(map[string]interface{}) |
| 193 | + if !ok { |
| 194 | + continue |
| 195 | + } |
| 196 | + |
| 197 | + // Check for required histogram fields |
| 198 | + _, hasValues := valueMap["Values"] |
| 199 | + _, hasCounts := valueMap["Counts"] |
| 200 | + _, hasMax := valueMap["Max"] |
| 201 | + _, hasMin := valueMap["Min"] |
| 202 | + _, hasCount := valueMap["Count"] |
| 203 | + _, hasSum := valueMap["Sum"] |
| 204 | + |
| 205 | + if hasValues && hasCounts && hasMax && hasMin && hasCount && hasSum { |
| 206 | + histogramFound = true |
| 207 | + log.Println("Found GPU metric in histogram format:", key) |
| 208 | + return nil |
| 209 | + } |
| 210 | + } |
| 211 | + |
| 212 | + return nil // Continue checking other logs |
| 213 | + }, |
| 214 | + ), |
| 215 | + ) |
| 216 | + |
| 217 | + if err != nil { |
| 218 | + log.Println("Error validating logs:", err) |
| 219 | + } |
| 220 | + |
| 221 | + if histogramFound { |
| 222 | + log.Println("Successfully found GPU metric in histogram format") |
| 223 | + testResult.Status = status.SUCCESSFUL |
| 224 | + return testResult |
| 225 | + } |
| 226 | + } |
| 227 | + |
| 228 | + log.Printf("Processed %d logs, found %d GPU metrics, but none in histogram format", logCount, gpuMetricCount) |
| 229 | + return testResult |
| 230 | +} |
0 commit comments