|
18 | 18 | "editable": true,
|
19 | 19 | "gnetId": 11752,
|
20 | 20 | "graphTooltip": 0,
|
21 |
| - "id": 5, |
22 |
| - "iteration": 1605896702545, |
| 21 | + "iteration": 1606131081690, |
23 | 22 | "links": [
|
24 | 23 | {
|
25 | 24 | "icon": "external link",
|
|
270 | 269 | "tableColumn": "",
|
271 | 270 | "targets": [
|
272 | 271 | {
|
273 |
| - "expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"})", |
| 272 | + "expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}", |
274 | 273 | "format": "time_series",
|
| 274 | + "instant": true, |
275 | 275 | "interval": "",
|
276 | 276 | "intervalFactor": 1,
|
277 | 277 | "legendFormat": "",
|
|
309 | 309 | },
|
310 | 310 | "format": "watt",
|
311 | 311 | "gauge": {
|
312 |
| - "maxValue": 2400, |
| 312 | + "maxValue": 100, |
313 | 313 | "minValue": 0,
|
314 | 314 | "show": true,
|
315 | 315 | "thresholdLabels": false,
|
|
358 | 358 | "tableColumn": "",
|
359 | 359 | "targets": [
|
360 | 360 | {
|
361 |
| - "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"})", |
| 361 | + "expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}", |
362 | 362 | "format": "time_series",
|
363 |
| - "instant": false, |
| 363 | + "instant": true, |
364 | 364 | "interval": "",
|
365 | 365 | "intervalFactor": 1,
|
366 | 366 | "legendFormat": "",
|
367 | 367 | "refId": "A"
|
368 | 368 | }
|
369 | 369 | ],
|
370 |
| - "thresholds": "1800,2200", |
| 370 | + "thresholds": "60,90", |
371 | 371 | "title": "GPU Total Power",
|
372 | 372 | "type": "singlestat",
|
373 | 373 | "valueFontSize": "80%",
|
|
390 | 390 | "#d44a3a"
|
391 | 391 | ],
|
392 | 392 | "datasource": "prometheus",
|
393 |
| - "description": "", |
394 | 393 | "fieldConfig": {
|
395 | 394 | "defaults": {
|
396 | 395 | "custom": {}
|
397 | 396 | },
|
398 | 397 | "overrides": []
|
399 | 398 | },
|
400 |
| - "format": "percent", |
| 399 | + "format": "celsius", |
401 | 400 | "gauge": {
|
402 |
| - "maxValue": 100, |
| 401 | + "maxValue": 90, |
403 | 402 | "minValue": 0,
|
404 | 403 | "show": true,
|
405 | 404 | "thresholdLabels": false,
|
|
411 | 410 | "x": 12,
|
412 | 411 | "y": 0
|
413 | 412 | },
|
414 |
| - "id": 68, |
| 413 | + "id": 31, |
415 | 414 | "interval": null,
|
416 | 415 | "links": [],
|
417 | 416 | "mappingType": 1,
|
|
448 | 447 | "tableColumn": "",
|
449 | 448 | "targets": [
|
450 | 449 | {
|
451 |
| - "expr": "avg(DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"})", |
| 450 | + "expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}", |
452 | 451 | "format": "time_series",
|
| 452 | + "instant": true, |
453 | 453 | "interval": "",
|
454 | 454 | "intervalFactor": 1,
|
455 | 455 | "legendFormat": "",
|
456 | 456 | "refId": "A"
|
457 | 457 | }
|
458 | 458 | ],
|
459 |
| - "thresholds": "80,90", |
460 |
| - "title": "GPU Decored Utilization", |
| 459 | + "thresholds": "83,87", |
| 460 | + "title": "GPU Avg. Temperature", |
461 | 461 | "type": "singlestat",
|
462 | 462 | "valueFontSize": "80%",
|
463 | 463 | "valueMaps": [
|
|
479 | 479 | "#d44a3a"
|
480 | 480 | ],
|
481 | 481 | "datasource": "prometheus",
|
| 482 | + "description": "", |
482 | 483 | "fieldConfig": {
|
483 | 484 | "defaults": {
|
484 | 485 | "custom": {}
|
485 | 486 | },
|
486 | 487 | "overrides": []
|
487 | 488 | },
|
488 |
| - "format": "celsius", |
| 489 | + "format": "percent", |
489 | 490 | "gauge": {
|
490 |
| - "maxValue": 90, |
| 491 | + "maxValue": 100, |
491 | 492 | "minValue": 0,
|
492 | 493 | "show": true,
|
493 | 494 | "thresholdLabels": false,
|
|
499 | 500 | "x": 15,
|
500 | 501 | "y": 0
|
501 | 502 | },
|
502 |
| - "id": 31, |
| 503 | + "id": 68, |
503 | 504 | "interval": null,
|
504 | 505 | "links": [],
|
505 | 506 | "mappingType": 1,
|
|
536 | 537 | "tableColumn": "",
|
537 | 538 | "targets": [
|
538 | 539 | {
|
539 |
| - "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"})", |
| 540 | + "expr": "DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"}", |
540 | 541 | "format": "time_series",
|
| 542 | + "instant": true, |
541 | 543 | "interval": "",
|
542 | 544 | "intervalFactor": 1,
|
543 | 545 | "legendFormat": "",
|
544 | 546 | "refId": "A"
|
545 | 547 | }
|
546 | 548 | ],
|
547 |
| - "thresholds": "83,87", |
548 |
| - "title": "GPU Avg. Temperature", |
| 549 | + "thresholds": "80,90", |
| 550 | + "title": "GPU Decoder Utilization", |
549 | 551 | "type": "singlestat",
|
550 | 552 | "valueFontSize": "80%",
|
551 | 553 | "valueMaps": [
|
|
625 | 627 | "tableColumn": "",
|
626 | 628 | "targets": [
|
627 | 629 | {
|
628 |
| - "expr": "avg(DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"})", |
| 630 | + "expr": "DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"}", |
629 | 631 | "format": "time_series",
|
| 632 | + "instant": true, |
630 | 633 | "interval": "",
|
631 | 634 | "intervalFactor": 1,
|
632 | 635 | "legendFormat": "",
|
|
714 | 717 | "tableColumn": "",
|
715 | 718 | "targets": [
|
716 | 719 | {
|
717 |
| - "expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance_id=\"$instance_id\"})", |
| 720 | + "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"})*100", |
718 | 721 | "format": "time_series",
|
| 722 | + "instant": true, |
719 | 723 | "interval": "",
|
720 | 724 | "intervalFactor": 1,
|
721 | 725 | "legendFormat": "",
|
722 | 726 | "refId": "A"
|
723 | 727 | }
|
724 | 728 | ],
|
725 | 729 | "thresholds": "70,90",
|
726 |
| - "title": "GPU Total Mem Cpy Utilization", |
| 730 | + "title": "GPU Mem Util.", |
727 | 731 | "type": "singlestat",
|
728 | 732 | "valueFontSize": "80%",
|
729 | 733 | "valueMaps": [
|
|
802 | 806 | "tableColumn": "",
|
803 | 807 | "targets": [
|
804 | 808 | {
|
805 |
| - "expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000)", |
| 809 | + "expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000", |
806 | 810 | "format": "time_series",
|
807 | 811 | "interval": "",
|
808 | 812 | "intervalFactor": 1,
|
|
890 | 894 | "tableColumn": "",
|
891 | 895 | "targets": [
|
892 | 896 | {
|
893 |
| - "expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000)", |
| 897 | + "expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000", |
894 | 898 | "format": "time_series",
|
895 | 899 | "interval": "",
|
896 | 900 | "intervalFactor": 1,
|
|
1044 | 1048 | "hiddenSeries": false,
|
1045 | 1049 | "id": 57,
|
1046 | 1050 | "legend": {
|
1047 |
| - "avg": false, |
| 1051 | + "alignAsTable": true, |
| 1052 | + "avg": true, |
1048 | 1053 | "current": true,
|
1049 |
| - "max": false, |
1050 |
| - "min": false, |
1051 |
| - "show": false, |
| 1054 | + "max": true, |
| 1055 | + "min": true, |
| 1056 | + "rightSide": true, |
| 1057 | + "show": true, |
1052 | 1058 | "total": false,
|
1053 | 1059 | "values": true
|
1054 | 1060 | },
|
|
1562 | 1568 | "hiddenSeries": false,
|
1563 | 1569 | "id": 42,
|
1564 | 1570 | "legend": {
|
1565 |
| - "avg": false, |
| 1571 | + "alignAsTable": true, |
| 1572 | + "avg": true, |
1566 | 1573 | "current": true,
|
1567 |
| - "max": false, |
1568 |
| - "min": false, |
1569 |
| - "show": false, |
| 1574 | + "max": true, |
| 1575 | + "min": true, |
| 1576 | + "rightSide": true, |
| 1577 | + "show": true, |
1570 | 1578 | "total": false,
|
1571 | 1579 | "values": true
|
1572 | 1580 | },
|
|
1588 | 1596 | "steppedLine": false,
|
1589 | 1597 | "targets": [
|
1590 | 1598 | {
|
1591 |
| - "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"})", |
| 1599 | + "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}", |
1592 | 1600 | "format": "time_series",
|
1593 | 1601 | "hide": false,
|
1594 | 1602 | "interval": "",
|
|
1617 | 1625 | },
|
1618 | 1626 | "yaxes": [
|
1619 | 1627 | {
|
| 1628 | + "$$hashKey": "object:1193", |
1620 | 1629 | "decimals": null,
|
1621 |
| - "format": "percentunit", |
| 1630 | + "format": "decmbytes", |
1622 | 1631 | "label": null,
|
1623 | 1632 | "logBase": 1,
|
1624 | 1633 | "max": null,
|
1625 | 1634 | "min": "0",
|
1626 | 1635 | "show": true
|
1627 | 1636 | },
|
1628 | 1637 | {
|
| 1638 | + "$$hashKey": "object:1194", |
1629 | 1639 | "format": "watt",
|
1630 | 1640 | "label": null,
|
1631 | 1641 | "logBase": 1,
|
|
1848 | 1858 | "list": [
|
1849 | 1859 | {
|
1850 | 1860 | "allValue": null,
|
| 1861 | + "current": { |
| 1862 | + "selected": false, |
| 1863 | + "text": "i-076225e1b1aefc813", |
| 1864 | + "value": "i-076225e1b1aefc813" |
| 1865 | + }, |
1851 | 1866 | "datasource": "prometheus",
|
1852 | 1867 | "definition": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"g[3-4].*\"}, instance_id)",
|
1853 | 1868 | "error": null,
|
|
0 commit comments