Bugfix: AMD metrics push to Prometheus (#61)

gablyu-oci · web-flow · commit 5fcbc976d1c9 · 2025-12-05T12:46:25.000-08:00
* Add post-upgrade hook to enable hostNetwork for AMD GPU exporter DaemonSet

* Update default values

* Update AMD metric names

* Update parallelism and completions in health check

* Remove AMD exporter patch job

* Update AMD hostname expressions
diff --git a/grafana_dashboards/oke-workloads-to-gpuscanner-mapping-dashboard.json b/grafana_dashboards/oke-workloads-to-gpuscanner-mapping-dashboard.json
@@ -2567,7 +2567,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}",
+              "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -2642,7 +2642,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"})",
+              "expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
               "hide": false,
               "instant": false,
               "legendFormat": "__auto",
@@ -2717,7 +2717,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}) ",
+              "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
               "interval": "",
               "legendFormat": "Avg GPU Temperature",
               "range": true,
@@ -2824,7 +2824,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR gpu_gfx_activity{hostname=~\"$hostname\"})",
+              "expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (amd_gpu_gfx_activity * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
               "interval": "",
               "legendFormat": "{{Hostname}}{{hostname}}",
               "range": true,
@@ -2929,7 +2929,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"}",
+              "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
               "interval": "",
               "legendFormat": "GPU {{gpu}}{{gpu_id}} - {{Hostname}}{{hostname}}",
               "range": true,
diff --git a/oci-scanner-plugin-helm/templates/health-check.yaml b/oci-scanner-plugin-helm/templates/health-check.yaml
@@ -23,8 +23,8 @@ metadata:
     component: gpu-monitoring
     priority: low
 spec:
-  parallelism: 1
-  completions: 1
+  parallelism: 3
+  completions: 3
   activeDeadlineSeconds: 7200
   backoffLimit: 0
   ttlSecondsAfterFinished: {{ .Values.healthCheck.ttlSecondsAfterFinished | default 86400 }}  # Clean up after 24h
diff --git a/oci-scanner-plugin-helm/templates/node-problem-detector.yaml b/oci-scanner-plugin-helm/templates/node-problem-detector.yaml
@@ -40,7 +40,6 @@ spec:
         - args:
             - /node-problem-detector --logtostderr --prometheus-port=${PROMETHEUS_PORT}
               --prometheus-address 0.0.0.0 --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json
-              --config.custom-plugin-monitor=/node-problem-detector-custom-check/imds_reachability.json
               {{- if .Values.nodeProblemDetector.enableGpuChecks }}
               --config.custom-plugin-monitor=/node-problem-detector-gpu-check/dr-hpc.json
               {{- end }}
diff --git a/oci-scanner-plugin-helm/values.yaml b/oci-scanner-plugin-helm/values.yaml
@@ -42,6 +42,7 @@ device-metrics-exporter-charts:
   global:
     imageRegistry: ""
   platform: k8s
+  hostNetwork: true
   nodeSelector:
     amd.com/gpu: "true"
   kubelet:
@@ -107,7 +108,7 @@ metricsPushJob:
 
 # Active Health Check
 healthCheck:
-  enabled: false
+  enabled: true
   image:
     repository: oci_lens_healthcheck_amd
     tag: v0.0.5
@@ -166,7 +167,7 @@ rbac:
 
 # Node Exporter
 nodeExporter:
-  enabled: false
+  enabled: true
   
 # Override default values for the official chart
 prometheus-node-exporter:
@@ -216,7 +217,7 @@ podNodeMapper:
 
 # Node Problem Detector
 nodeProblemDetector:
-  enabled: false
+  enabled: true
   enableGpuChecks: true  # Enable GPU health checks via DRHPC
   
   # DRHPC results path - must match the hostPath where DRHPC writes results