Skip to content

Commit c02aff0

Browse files
committed
fix Prometheus config and Grafana Dashboard for GPU support
1 parent 1e93a8b commit c02aff0

File tree

2 files changed

+21
-44
lines changed

2 files changed

+21
-44
lines changed

grafana/dashboards/gpu.json

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
"editable": true,
1919
"gnetId": 11752,
2020
"graphTooltip": 0,
21-
"id": 7,
22-
"iteration": 1605631221337,
21+
"id": 5,
22+
"iteration": 1605896702545,
2323
"links": [
2424
{
2525
"icon": "external link",
@@ -951,7 +951,7 @@
951951
"alertThreshold": true
952952
},
953953
"percentage": false,
954-
"pluginVersion": "7.3.2",
954+
"pluginVersion": "7.3.3",
955955
"pointradius": 5,
956956
"points": false,
957957
"renderer": "flot",
@@ -1060,7 +1060,7 @@
10601060
"alertThreshold": true
10611061
},
10621062
"percentage": false,
1063-
"pluginVersion": "7.3.2",
1063+
"pluginVersion": "7.3.3",
10641064
"pointradius": 5,
10651065
"points": false,
10661066
"renderer": "flot",
@@ -1160,7 +1160,7 @@
11601160
"alertThreshold": true
11611161
},
11621162
"percentage": false,
1163-
"pluginVersion": "7.3.2",
1163+
"pluginVersion": "7.3.3",
11641164
"pointradius": 5,
11651165
"points": false,
11661166
"renderer": "flot",
@@ -1264,7 +1264,7 @@
12641264
"alertThreshold": true
12651265
},
12661266
"percentage": false,
1267-
"pluginVersion": "7.3.2",
1267+
"pluginVersion": "7.3.3",
12681268
"pointradius": 5,
12691269
"points": false,
12701270
"renderer": "flot",
@@ -1381,7 +1381,7 @@
13811381
"alertThreshold": true
13821382
},
13831383
"percentage": false,
1384-
"pluginVersion": "7.3.2",
1384+
"pluginVersion": "7.3.3",
13851385
"pointradius": 2,
13861386
"points": false,
13871387
"renderer": "flot",
@@ -1478,7 +1478,7 @@
14781478
"alertThreshold": true
14791479
},
14801480
"percentage": false,
1481-
"pluginVersion": "7.3.2",
1481+
"pluginVersion": "7.3.3",
14821482
"pointradius": 5,
14831483
"points": false,
14841484
"renderer": "flot",
@@ -1578,7 +1578,7 @@
15781578
"alertThreshold": true
15791579
},
15801580
"percentage": false,
1581-
"pluginVersion": "7.3.2",
1581+
"pluginVersion": "7.3.3",
15821582
"pointradius": 5,
15831583
"points": false,
15841584
"renderer": "flot",
@@ -1680,7 +1680,7 @@
16801680
"alertThreshold": true
16811681
},
16821682
"percentage": false,
1683-
"pluginVersion": "7.3.2",
1683+
"pluginVersion": "7.3.3",
16841684
"pointradius": 2,
16851685
"points": false,
16861686
"renderer": "flot",
@@ -1777,7 +1777,7 @@
17771777
"alertThreshold": true
17781778
},
17791779
"percentage": false,
1780-
"pluginVersion": "7.3.2",
1780+
"pluginVersion": "7.3.3",
17811781
"pointradius": 5,
17821782
"points": false,
17831783
"renderer": "flot",
@@ -1848,21 +1848,16 @@
18481848
"list": [
18491849
{
18501850
"allValue": null,
1851-
"current": {
1852-
"selected": true,
1853-
"text": "i-002fd33e53d27b6dd",
1854-
"value": "i-002fd33e53d27b6dd"
1855-
},
18561851
"datasource": "prometheus",
1857-
"definition": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)",
1852+
"definition": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"g[3-4].*\"}, instance_id)",
18581853
"error": null,
18591854
"hide": 0,
18601855
"includeAll": false,
18611856
"label": "Instance ID",
18621857
"multi": false,
18631858
"name": "instance_id",
18641859
"options": [],
1865-
"query": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)",
1860+
"query": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"g[3-4].*\"}, instance_id)",
18661861
"refresh": 1,
18671862
"regex": "",
18681863
"skipUrlSync": false,
@@ -1907,5 +1902,5 @@
19071902
"timezone": "browser",
19081903
"title": "GPU Nodes",
19091904
"uid": "hpcsyl6zhqk",
1910-
"version": 19
1905+
"version": 1
19111906
}

prometheus/prometheus.yml

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ scrape_configs:
2828
ec2_sd_configs:
2929
- port: 9100
3030
refresh_interval: 10s
31+
- port: 9400
32+
refresh_interval: 10s
33+
filters:
34+
- name: instance-type
35+
values:
36+
- g*
3137
relabel_configs:
3238
- source_labels: [__meta_ec2_tag_Name]
3339
target_label: instance_name
@@ -46,28 +52,4 @@ scrape_configs:
4652
- source_labels: [__meta_ec2_instance_type]
4753
target_label: instance_type
4854
- source_labels: [__meta_ec2_vpc_id]
49-
target_label: instance_vpc
50-
- job_name: 'ec2_gpu_instances'
51-
scrape_interval: 5s
52-
ec2_sd_configs:
53-
- port: 9400
54-
refresh_interval: 10s
55-
relabel_configs:
56-
- source_labels: [__meta_ec2_tag_Name]
57-
target_label: gpu_instance_name
58-
- source_labels: [__meta_ec2_tag_Application]
59-
target_label: gpu_instance_grafana
60-
regex: __Application__
61-
action: keep
62-
- source_labels: [__meta_ec2_instance_id]
63-
target_label: gpu_instance_id
64-
- source_labels: [__meta_ec2_availability_zone]
65-
target_label: gpu_instance_az
66-
- source_labels: [__meta_ec2_instance_state]
67-
regex: running
68-
action: keep
69-
target_label: gpu_instance_state
70-
- source_labels: [__meta_ec2_instance_type]
71-
target_label: gpu_instance_type
72-
- source_labels: [__meta_ec2_vpc_id]
73-
target_label: gpu_instance_vpc
55+
target_label: instance_vpc

0 commit comments

Comments
 (0)