Skip to content

Commit 5fcbc97

Browse files
authored
Bugfix: AMD metrics push to prometheus (#61)
* Add post-upgrade hook to enable hostNetwork for AMD GPU exporter Daemon
* Update default values
* Updated AMD metrics names
* Updated parallelism and completions in health check
* Removed amd exporter patch job
* Update AMD hostname expressions
1 parent dba6e5e commit 5fcbc97

File tree

4 files changed

+11
-11
lines changed

4 files changed

+11
-11
lines changed

grafana_dashboards/oke-workloads-to-gpuscanner-mapping-dashboard.json

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2567,7 +2567,7 @@
25672567
"uid": "{{DATASOURCE_PROMETHEUS}}"
25682568
},
25692569
"editorMode": "code",
2570-
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}",
2570+
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
25712571
"hide": false,
25722572
"instant": false,
25732573
"interval": "",
@@ -2642,7 +2642,7 @@
26422642
"uid": "{{DATASOURCE_PROMETHEUS}}"
26432643
},
26442644
"editorMode": "code",
2645-
"expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"})",
2645+
"expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
26462646
"hide": false,
26472647
"instant": false,
26482648
"legendFormat": "__auto",
@@ -2717,7 +2717,7 @@
27172717
"uid": "{{DATASOURCE_PROMETHEUS}}"
27182718
},
27192719
"editorMode": "code",
2720-
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}) ",
2720+
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
27212721
"interval": "",
27222722
"legendFormat": "Avg GPU Temperature",
27232723
"range": true,
@@ -2824,7 +2824,7 @@
28242824
"uid": "{{DATASOURCE_PROMETHEUS}}"
28252825
},
28262826
"editorMode": "code",
2827-
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR gpu_gfx_activity{hostname=~\"$hostname\"})",
2827+
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (amd_gpu_gfx_activity * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
28282828
"interval": "",
28292829
"legendFormat": "{{Hostname}}{{hostname}}",
28302830
"range": true,
@@ -2929,7 +2929,7 @@
29292929
"uid": "{{DATASOURCE_PROMETHEUS}}"
29302930
},
29312931
"editorMode": "code",
2932-
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"}",
2932+
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
29332933
"interval": "",
29342934
"legendFormat": "GPU {{gpu}}{{gpu_id}} - {{Hostname}}{{hostname}}",
29352935
"range": true,

oci-scanner-plugin-helm/templates/health-check.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ metadata:
2323
component: gpu-monitoring
2424
priority: low
2525
spec:
26-
parallelism: 1
27-
completions: 1
26+
parallelism: 3
27+
completions: 3
2828
activeDeadlineSeconds: 7200
2929
backoffLimit: 0
3030
ttlSecondsAfterFinished: {{ .Values.healthCheck.ttlSecondsAfterFinished | default 86400 }} # Clean up after 24h

oci-scanner-plugin-helm/templates/node-problem-detector.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ spec:
4040
- args:
4141
- /node-problem-detector --logtostderr --prometheus-port=${PROMETHEUS_PORT}
4242
--prometheus-address 0.0.0.0 --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json
43-
--config.custom-plugin-monitor=/node-problem-detector-custom-check/imds_reachability.json
4443
{{- if .Values.nodeProblemDetector.enableGpuChecks }}
4544
--config.custom-plugin-monitor=/node-problem-detector-gpu-check/dr-hpc.json
4645
{{- end }}

oci-scanner-plugin-helm/values.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ device-metrics-exporter-charts:
4242
global:
4343
imageRegistry: ""
4444
platform: k8s
45+
hostNetwork: true
4546
nodeSelector:
4647
amd.com/gpu: "true"
4748
kubelet:
@@ -107,7 +108,7 @@ metricsPushJob:
107108

108109
# Active Health Check
109110
healthCheck:
110-
enabled: false
111+
enabled: true
111112
image:
112113
repository: oci_lens_healthcheck_amd
113114
tag: v0.0.5
@@ -166,7 +167,7 @@ rbac:
166167

167168
# Node Exporter
168169
nodeExporter:
169-
enabled: false
170+
enabled: true
170171

171172
# Override default values for the official chart
172173
prometheus-node-exporter:
@@ -216,7 +217,7 @@ podNodeMapper:
216217

217218
# Node Problem Detector
218219
nodeProblemDetector:
219-
enabled: false
220+
enabled: true
220221
enableGpuChecks: true # Enable GPU health checks via DRHPC
221222

222223
# DRHPC results path - must match the hostPath where DRHPC writes results

0 commit comments

Comments
 (0)