@@ -6,10 +6,10 @@ metadata:
6
6
data :
7
7
{{- $service := dict "service" "coder" -}}
8
8
9
- {{- with .Values.global.coder.coderd }}
10
- coder .yaml : |-
9
+ {{- with .Values.global.coder.alerts. coderd }} {{/* start-section */ }}
10
+ coderd .yaml : |-
11
11
groups:
12
- {{- with .alerts. groups.CPU }}
12
+ {{- with .groups.CPU }}
13
13
{{- $group := . }}
14
14
{{- if .enabled }}
15
15
- name : CPU Usage
@@ -19,15 +19,16 @@ data:
19
19
- alert : {{ $alert }}
20
20
expr : max by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) / max by(pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="cpu"}) > {{ $threshold }}
21
21
for : {{ $group.delay }}
22
- labels :
22
+ annotations :
23
23
summary : The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of CPU, which may impact application performance.
24
+ labels :
24
25
severity : {{ $severity }}
25
26
runbook_url : {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
26
27
{{- end }}
27
28
{{- end }}
28
29
{{- end }}
29
30
30
- {{- with .alerts. groups.Memory }}
31
+ {{- with .groups.Memory }}
31
32
{{- $group := . }}
32
33
{{- if .enabled }}
33
34
- name : Memory Usage
@@ -37,15 +38,16 @@ data:
37
38
- alert : {{ $alert }}
38
39
expr : max by (pod) (container_memory_working_set_bytes{ {{- include "coderd-selector" $ -}} }) / max by (pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="memory"}) > {{ $threshold }}
39
40
for : {{ $group.delay }}
40
- labels :
41
+ annotations :
41
42
summary : The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.
43
+ labels :
42
44
severity : {{ $severity }}
43
45
runbook_url : {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
44
46
{{- end }}
45
47
{{- end }}
46
48
{{- end }}
47
49
48
- {{- with .alerts. groups.Restarts }}
50
+ {{- with .groups.Restarts }}
49
51
{{- $group := . }}
50
52
{{- if .enabled }}
51
53
- name : Pod Restarts
@@ -55,14 +57,41 @@ data:
55
57
- alert : {{ $alert }}
56
58
expr : sum by(pod) (increase(kube_pod_container_status_restarts_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) > {{ $threshold }}
57
59
for : {{ $group.delay }}
58
- labels :
60
+ annotations :
59
61
summary : The Coder instance {{ `{{ $labels.pod }}` }} has restarted multiple times in the last {{ $group.period -}}, which may indicate a CrashLoop.
62
+ labels :
60
63
severity : {{ $severity }}
61
64
runbook_url : {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
62
65
{{- end }}
63
66
{{- end }}
64
67
{{- end }}
68
+ {{- end }} {{/* end-section */}}
69
+
70
+
71
+ {{- $service = dict "service" "enterprise" -}}
72
+
73
+ {{- with .Values.global.coder.alerts.enterprise }} {{/* start-section */}}
74
+ enterprise.yaml : |-
75
+ groups:
76
+ {{- with .groups.Licences }}
77
+ {{- $group := . }}
78
+ {{- if .enabled }}
79
+ - name : Licences
80
+ rules :
81
+ {{ $alert := "CoderLicenseSeats" }}
82
+ {{- range $severity, $threshold := .thresholds }}
83
+ - alert : {{ $alert }}
84
+ expr : ' max(coderd_license_active_users) / max(coderd_license_limit_users) >= {{- $threshold }}'
85
+ for : {{ $group.delay }}
86
+ annotations :
87
+ summary : Your Coder enterprise licence usage is now at {{ `{{ $value | humanizePercentage }}` }} capacity.
88
+ labels :
89
+ severity : {{ $severity }}
90
+ runbook_url : {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
91
+ {{- end }}
92
+ {{- end }}
65
93
{{- end }}
94
+ {{- end }} {{/* end-section */}}
66
95
67
96
{{- $service = dict "service" "postgres" -}}
68
97
{{- with .Values.global.postgres }}
78
107
- alert : {{ $alert }}
79
108
expr : {{ include "postgres-pubsub-queue-usage-metric-name" . }} > {{ $threshold }}
80
109
for : {{ $group.delay }}
81
- labels :
110
+ annotations :
82
111
summary : The postgres instance {{ `{{ $labels.instance }}` }} has a notification that is filling up, which may impact application performance.
112
+ labels :
83
113
severity : {{ $severity }}
84
114
runbook_url : {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
85
115
{{- end }}
94
124
- alert : {{ $alert }}
95
125
expr : pg_up == 0
96
126
for : {{ $group.delay }}
97
- labels :
127
+ annotations :
98
128
summary : The postgres instance {{ `{{ $labels.instance }}` }} is down!
129
+ labels :
99
130
severity : critical
100
131
runbook_url : {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
101
132
{{- end }}
0 commit comments