Skip to content

Commit 6905f6b

Browse files
committed
Add licence alert
Signed-off-by: Danny Kopping <[email protected]>
1 parent 83c291b commit 6905f6b

File tree

3 files changed

+59
-14
lines changed

3 files changed

+59
-14
lines changed

coder-observability/runbooks/coderd.md

+6
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,9 @@ If Coder is not restarting due to excessive memory usage, check the logs:
5454
```console
5555
kubectl -n <coder namespace> events --watch
5656
```
57+
58+
## CoderdLicenseSeats
59+
60+
Your Enterprise license usage is approaching, or has exceeded, the number of seats purchased.
61+
62+
Please reach out to your Coder sales representative, or visit https://coder.com/contact/sales.

coder-observability/templates/configmap-prometheus-alerts.yaml

+41-10
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ metadata:
66
data:
77
{{- $service := dict "service" "coder" -}}
88

9-
{{- with .Values.global.coder.coderd }}
10-
coder.yaml: |-
9+
{{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}}
10+
coderd.yaml: |-
1111
groups:
12-
{{- with .alerts.groups.CPU }}
12+
{{- with .groups.CPU }}
1313
{{- $group := . }}
1414
{{- if .enabled }}
1515
- name: CPU Usage
@@ -19,15 +19,16 @@ data:
1919
- alert: {{ $alert }}
2020
expr: max by (pod) (rate(container_cpu_usage_seconds_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) / max by(pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="cpu"}) > {{ $threshold }}
2121
for: {{ $group.delay }}
22-
labels:
22+
annotations:
2323
summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of CPU, which may impact application performance.
24+
labels:
2425
severity: {{ $severity }}
2526
runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
2627
{{- end }}
2728
{{- end }}
2829
{{- end }}
2930

30-
{{- with .alerts.groups.Memory }}
31+
{{- with .groups.Memory }}
3132
{{- $group := . }}
3233
{{- if .enabled }}
3334
- name: Memory Usage
@@ -37,15 +38,16 @@ data:
3738
- alert: {{ $alert }}
3839
expr: max by (pod) (container_memory_working_set_bytes{ {{- include "coderd-selector" $ -}} }) / max by (pod) (kube_pod_container_resource_limits{ {{- include "coderd-selector" $ -}}, resource="memory"}) > {{ $threshold }}
3940
for: {{ $group.delay }}
40-
labels:
41+
annotations:
4142
summary: The Coder instance {{ `{{ $labels.pod }}` }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.
43+
labels:
4244
severity: {{ $severity }}
4345
runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
4446
{{- end }}
4547
{{- end }}
4648
{{- end }}
4749

48-
{{- with .alerts.groups.Restarts }}
50+
{{- with .groups.Restarts }}
4951
{{- $group := . }}
5052
{{- if .enabled }}
5153
- name: Pod Restarts
@@ -55,14 +57,41 @@ data:
5557
- alert: {{ $alert }}
5658
expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{ {{- include "coderd-selector" $ -}} }[{{- $group.period -}}])) > {{ $threshold }}
5759
for: {{ $group.delay }}
58-
labels:
60+
annotations:
5961
summary: The Coder instance {{ `{{ $labels.pod }}` }} has restarted multiple times in the last {{ $group.period -}}, which may indicate a CrashLoop.
62+
labels:
6063
severity: {{ $severity }}
6164
runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
6265
{{- end }}
6366
{{- end }}
6467
{{- end }}
68+
{{- end }} {{/* end-section */}}
69+
70+
71+
{{- $service = dict "service" "enterprise" -}}
72+
73+
{{- with .Values.global.coder.alerts.enterprise }} {{/* start-section */}}
74+
enterprise.yaml: |-
75+
groups:
76+
{{- with .groups.Licences }}
77+
{{- $group := . }}
78+
{{- if .enabled }}
79+
- name: Licences
80+
rules:
81+
{{ $alert := "CoderLicenseSeats" }}
82+
{{- range $severity, $threshold := .thresholds }}
83+
- alert: {{ $alert }}
84+
expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >= {{- $threshold }}'
85+
for: {{ $group.delay }}
86+
annotations:
87+
summary: Your Coder enterprise licence usage is now at {{ `{{ $value | humanizePercentage }}` }} capacity.
88+
labels:
89+
severity: {{ $severity }}
90+
runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
91+
{{- end }}
92+
{{- end }}
6593
{{- end }}
94+
{{- end }} {{/* end-section */}}
6695

6796
{{- $service = dict "service" "postgres" -}}
6897
{{- with .Values.global.postgres }}
@@ -78,8 +107,9 @@ data:
78107
- alert: {{ $alert }}
79108
expr: {{ include "postgres-pubsub-queue-usage-metric-name" . }} > {{ $threshold }}
80109
for: {{ $group.delay }}
81-
labels:
110+
annotations:
82111
summary: The postgres instance {{ `{{ $labels.instance }}` }} has a notification that is filling up, which may impact application performance.
112+
labels:
83113
severity: {{ $severity }}
84114
runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
85115
{{- end }}
@@ -94,8 +124,9 @@ data:
94124
- alert: {{ $alert }}
95125
expr: pg_up == 0
96126
for: {{ $group.delay }}
97-
labels:
127+
annotations:
98128
summary: The postgres instance {{ `{{ $labels.instance }}` }} is down!
129+
labels:
99130
severity: critical
100131
runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }}
101132
{{- end }}

coder-observability/values.yaml

+12-4
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,17 @@ global:
3030
# with regex matchers.
3131
# TODO: support "json" format
3232
logFormat: human
33-
# global.coder.alerts -- alerts for Coder
34-
coderd:
35-
alerts:
33+
# global.coder.alerts -- alerts for the various aspects of Coder
34+
alerts:
35+
enterprise:
36+
groups:
37+
Licences:
38+
enabled: true
39+
delay: 1m
40+
thresholds:
41+
warning: 0.9
42+
critical: 1
43+
coderd:
3644
groups:
3745
CPU:
3846
enabled: true
@@ -84,7 +92,7 @@ global:
8492
database: coder
8593
sslmode: disable
8694
# ensure that your secret has a field named `PGPASSWORD`
87-
mountSecret:
95+
mountSecret: "secret-postgres"
8896

8997
# global.postgres.alerts -- alerts for postgres
9098
alerts:

0 commit comments

Comments
 (0)