Skip to content

Commit 24a5b56

Browse files
committed
Fix GPU Dashboard
1 parent c02aff0 commit 24a5b56

File tree

1 file changed

+50
-35
lines changed

1 file changed

+50
-35
lines changed

grafana/dashboards/gpu.json

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818
"editable": true,
1919
"gnetId": 11752,
2020
"graphTooltip": 0,
21-
"id": 5,
22-
"iteration": 1605896702545,
21+
"iteration": 1606131081690,
2322
"links": [
2423
{
2524
"icon": "external link",
@@ -270,8 +269,9 @@
270269
"tableColumn": "",
271270
"targets": [
272271
{
273-
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"})",
272+
"expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}",
274273
"format": "time_series",
274+
"instant": true,
275275
"interval": "",
276276
"intervalFactor": 1,
277277
"legendFormat": "",
@@ -309,7 +309,7 @@
309309
},
310310
"format": "watt",
311311
"gauge": {
312-
"maxValue": 2400,
312+
"maxValue": 100,
313313
"minValue": 0,
314314
"show": true,
315315
"thresholdLabels": false,
@@ -358,16 +358,16 @@
358358
"tableColumn": "",
359359
"targets": [
360360
{
361-
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"})",
361+
"expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}",
362362
"format": "time_series",
363-
"instant": false,
363+
"instant": true,
364364
"interval": "",
365365
"intervalFactor": 1,
366366
"legendFormat": "",
367367
"refId": "A"
368368
}
369369
],
370-
"thresholds": "1800,2200",
370+
"thresholds": "60,90",
371371
"title": "GPU Total Power",
372372
"type": "singlestat",
373373
"valueFontSize": "80%",
@@ -390,16 +390,15 @@
390390
"#d44a3a"
391391
],
392392
"datasource": "prometheus",
393-
"description": "",
394393
"fieldConfig": {
395394
"defaults": {
396395
"custom": {}
397396
},
398397
"overrides": []
399398
},
400-
"format": "percent",
399+
"format": "celsius",
401400
"gauge": {
402-
"maxValue": 100,
401+
"maxValue": 90,
403402
"minValue": 0,
404403
"show": true,
405404
"thresholdLabels": false,
@@ -411,7 +410,7 @@
411410
"x": 12,
412411
"y": 0
413412
},
414-
"id": 68,
413+
"id": 31,
415414
"interval": null,
416415
"links": [],
417416
"mappingType": 1,
@@ -448,16 +447,17 @@
448447
"tableColumn": "",
449448
"targets": [
450449
{
451-
"expr": "avg(DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"})",
450+
"expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}",
452451
"format": "time_series",
452+
"instant": true,
453453
"interval": "",
454454
"intervalFactor": 1,
455455
"legendFormat": "",
456456
"refId": "A"
457457
}
458458
],
459-
"thresholds": "80,90",
460-
"title": "GPU Decored Utilization",
459+
"thresholds": "83,87",
460+
"title": "GPU Avg. Temperature",
461461
"type": "singlestat",
462462
"valueFontSize": "80%",
463463
"valueMaps": [
@@ -479,15 +479,16 @@
479479
"#d44a3a"
480480
],
481481
"datasource": "prometheus",
482+
"description": "",
482483
"fieldConfig": {
483484
"defaults": {
484485
"custom": {}
485486
},
486487
"overrides": []
487488
},
488-
"format": "celsius",
489+
"format": "percent",
489490
"gauge": {
490-
"maxValue": 90,
491+
"maxValue": 100,
491492
"minValue": 0,
492493
"show": true,
493494
"thresholdLabels": false,
@@ -499,7 +500,7 @@
499500
"x": 15,
500501
"y": 0
501502
},
502-
"id": 31,
503+
"id": 68,
503504
"interval": null,
504505
"links": [],
505506
"mappingType": 1,
@@ -536,16 +537,17 @@
536537
"tableColumn": "",
537538
"targets": [
538539
{
539-
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"})",
540+
"expr": "DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"}",
540541
"format": "time_series",
542+
"instant": true,
541543
"interval": "",
542544
"intervalFactor": 1,
543545
"legendFormat": "",
544546
"refId": "A"
545547
}
546548
],
547-
"thresholds": "83,87",
548-
"title": "GPU Avg. Temperature",
549+
"thresholds": "80,90",
550+
"title": "GPU Decored Utilization",
549551
"type": "singlestat",
550552
"valueFontSize": "80%",
551553
"valueMaps": [
@@ -625,8 +627,9 @@
625627
"tableColumn": "",
626628
"targets": [
627629
{
628-
"expr": "avg(DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"})",
630+
"expr": "DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"}",
629631
"format": "time_series",
632+
"instant": true,
630633
"interval": "",
631634
"intervalFactor": 1,
632635
"legendFormat": "",
@@ -714,16 +717,17 @@
714717
"tableColumn": "",
715718
"targets": [
716719
{
717-
"expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance_id=\"$instance_id\"})",
720+
"expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"})*100",
718721
"format": "time_series",
722+
"instant": true,
719723
"interval": "",
720724
"intervalFactor": 1,
721725
"legendFormat": "",
722726
"refId": "A"
723727
}
724728
],
725729
"thresholds": "70,90",
726-
"title": "GPU Total Mem Cpy Utilization",
730+
"title": "GPU Mem Util.",
727731
"type": "singlestat",
728732
"valueFontSize": "80%",
729733
"valueMaps": [
@@ -802,7 +806,7 @@
802806
"tableColumn": "",
803807
"targets": [
804808
{
805-
"expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000)",
809+
"expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000",
806810
"format": "time_series",
807811
"interval": "",
808812
"intervalFactor": 1,
@@ -890,7 +894,7 @@
890894
"tableColumn": "",
891895
"targets": [
892896
{
893-
"expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000)",
897+
"expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000",
894898
"format": "time_series",
895899
"interval": "",
896900
"intervalFactor": 1,
@@ -1044,11 +1048,13 @@
10441048
"hiddenSeries": false,
10451049
"id": 57,
10461050
"legend": {
1047-
"avg": false,
1051+
"alignAsTable": true,
1052+
"avg": true,
10481053
"current": true,
1049-
"max": false,
1050-
"min": false,
1051-
"show": false,
1054+
"max": true,
1055+
"min": true,
1056+
"rightSide": true,
1057+
"show": true,
10521058
"total": false,
10531059
"values": true
10541060
},
@@ -1562,11 +1568,13 @@
15621568
"hiddenSeries": false,
15631569
"id": 42,
15641570
"legend": {
1565-
"avg": false,
1571+
"alignAsTable": true,
1572+
"avg": true,
15661573
"current": true,
1567-
"max": false,
1568-
"min": false,
1569-
"show": false,
1574+
"max": true,
1575+
"min": true,
1576+
"rightSide": true,
1577+
"show": true,
15701578
"total": false,
15711579
"values": true
15721580
},
@@ -1588,7 +1596,7 @@
15881596
"steppedLine": false,
15891597
"targets": [
15901598
{
1591-
"expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"})",
1599+
"expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}",
15921600
"format": "time_series",
15931601
"hide": false,
15941602
"interval": "",
@@ -1617,15 +1625,17 @@
16171625
},
16181626
"yaxes": [
16191627
{
1628+
"$$hashKey": "object:1193",
16201629
"decimals": null,
1621-
"format": "percentunit",
1630+
"format": "decmbytes",
16221631
"label": null,
16231632
"logBase": 1,
16241633
"max": null,
16251634
"min": "0",
16261635
"show": true
16271636
},
16281637
{
1638+
"$$hashKey": "object:1194",
16291639
"format": "watt",
16301640
"label": null,
16311641
"logBase": 1,
@@ -1848,6 +1858,11 @@
18481858
"list": [
18491859
{
18501860
"allValue": null,
1861+
"current": {
1862+
"selected": false,
1863+
"text": "i-076225e1b1aefc813",
1864+
"value": "i-076225e1b1aefc813"
1865+
},
18511866
"datasource": "prometheus",
18521867
"definition": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"g[3-4].*\"}, instance_id)",
18531868
"error": null,

0 commit comments

Comments
 (0)