Commit f02fe47

add alert policies for several metrics
1 parent 119b9de commit f02fe47

File tree

4 files changed: +223 −0 lines changed


terraform/infrastructure/main.tf (+9)

@@ -136,3 +136,12 @@ module "cloud_endpoint" {
    google_project_service.project
  ]
}

module "alert_policies" {
  source     = "../modules/alert_policies"
  project_id = var.project_id

  depends_on = [
    google_project_service.project
  ]
}
+36 (new file)

@@ -0,0 +1,36 @@
<!-- BEGIN_TF_DOCS -->
## Requirements

No requirements.

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | n/a |

## Modules

No modules.

## Resources

| Name | Type |
|------|------|
| [google_monitoring_alert_policy.k8s_critical_resource_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_failed_pod_not_starting_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_pending_pod_not_starting_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_readiness_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_restart_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_notification_channel.email_channel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_notification_channel) | resource |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID to deploy to | `string` | n/a | yes |

## Outputs

No outputs.
<!-- END_TF_DOCS -->
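The module exposes a single required input, `project_id`, and no outputs. A minimal usage sketch, mirroring the call this commit adds to terraform/infrastructure/main.tf (the relative `source` path resolves to terraform/modules/alert_policies when called from terraform/infrastructure):

module "alert_policies" {
  # Path as referenced from terraform/infrastructure/main.tf
  source = "../modules/alert_policies"

  # The only required input: the GCP project the policies are created in
  project_id = var.project_id
}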
+174 (new file)

@@ -0,0 +1,174 @@
resource "google_monitoring_notification_channel" "email_channel" {
  display_name = "Alert Policy Email Notification Channel"
  type         = "email"
  labels = {
    email_address = "[email protected]"
  }
  force_delete = false
}

# Alert policy for pods stuck in the Pending phase
resource "google_monitoring_alert_policy" "k8s_pending_pod_not_starting_alert_policy" {
  display_name          = "K8s pending pods alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "Pending Pod Alert"
    condition_prometheus_query_language {
      # The metric is 0 or 1 depending on whether the pod is in the specified phase
      query               = "kube_pod_status_phase{namespace=\"hono\", phase=\"Pending\", project_id=\"${var.project_id}\"} != 0"
      duration            = "300s" # 5 min
      evaluation_interval = "30s"
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pending Pods Alert"
    content = "Pod has been in the Pending phase for over 5 minutes."
  }
}

# Alert policy for pods in the Failed phase
resource "google_monitoring_alert_policy" "k8s_failed_pod_not_starting_alert_policy" {
  display_name          = "K8s failed pods alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "Failed Pod Alert"
    condition_prometheus_query_language {
      # The metric is 0 or 1 depending on whether the pod is in the specified phase
      query               = "kube_pod_status_phase{namespace=\"hono\", phase=\"Failed\", project_id=\"${var.project_id}\"} != 0"
      duration            = "300s" # 5 min
      evaluation_interval = "30s"
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Failed Pod Alert"
    content = "Pod has been stuck in the Failed phase for over 5 minutes."
  }
}

# Alert policy for constantly restarting pods
resource "google_monitoring_alert_policy" "k8s_restart_alert_policy" {
  display_name          = "K8s pods constantly restarting alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "K8s pods in hono are continuously restarting"
    condition_threshold {
      filter          = "resource.type=\"k8s_container\" AND resource.labels.project_id=\"${var.project_id}\" AND metric.type=\"kubernetes.io/container/restart_count\" AND resource.labels.namespace_name=\"hono\""
      comparison      = "COMPARISON_GT"
      threshold_value = 3
      duration        = "300s"
      aggregations {
        alignment_period = "300s"
        # ALIGN_DELTA counts restarts within each 5-minute window, matching the
        # documented intent; ALIGN_RATE would measure restarts per second, so a
        # threshold of 3 would effectively never fire.
        per_series_aligner = "ALIGN_DELTA"
      }
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pod continuously restarting"
    content = "The pod has been restarted more than 3 times in the last 5 minutes."
  }
}

# Alert policy for critical CPU / memory request utilization
resource "google_monitoring_alert_policy" "k8s_critical_resource_alert_policy" {
  display_name          = "K8s cluster critical resource usage alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "K8s cluster cpu load is above 95%"
    condition_threshold {
      filter          = "resource.type=\"k8s_container\" AND resource.labels.project_id=\"${var.project_id}\" AND metric.type=\"kubernetes.io/container/cpu/request_utilization\" AND resource.labels.namespace_name=\"hono\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0.95 # 95% of the CPU request
      duration        = "300s"
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }
    }
  }

  conditions {
    display_name = "K8s cluster memory usage is above 90%"
    condition_threshold {
      filter          = "resource.type=\"k8s_container\" AND resource.labels.project_id=\"${var.project_id}\" AND metric.type=\"kubernetes.io/container/memory/request_utilization\" AND resource.labels.namespace_name=\"hono\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0.90 # 90% of the memory request
      duration        = "300s"
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pod using a critical amount of resources"
    content = "Pod has been using more than 95% of its CPU request or 90% of its memory request for the last 5 minutes."
  }
}

# Alert policy for containers failing readiness
resource "google_monitoring_alert_policy" "k8s_readiness_alert_policy" {
  display_name          = "K8s readiness failures alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "Pod readiness failure alert"
    condition_prometheus_query_language {
      # Every ready container reports 1; the condition matches containers stuck at 0
      query               = "kube_pod_container_status_ready{namespace=\"hono\",project_id=\"${var.project_id}\"} == 0"
      duration            = "300s" # 5 min
      evaluation_interval = "30s"
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pod not ready alert"
    content = "The pod has not been ready for over 5 minutes."
  }
}
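The notification channel above hard-codes the recipient address. As a design note, the address could be promoted to a module input so different environments can route alerts differently; a minimal sketch, where the variable name `notification_email` is hypothetical and not part of this commit:

# Hypothetical input, not part of this commit
variable "notification_email" {
  description = "E-mail address that receives alert notifications"
  type        = string
}

resource "google_monitoring_notification_channel" "email_channel" {
  display_name = "Alert Policy Email Notification Channel"
  type         = "email"
  labels = {
    # Replaces the hard-coded address with the variable
    email_address = var.notification_email
  }
  force_delete = false
}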
+4 (new file)

@@ -0,0 +1,4 @@
variable "project_id" {
  description = "The project ID to deploy to"
  type        = string
}
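Since the root configuration passes its own `project_id` through to the module (see terraform/infrastructure/main.tf above), deploying the new policies only requires supplying a value for that variable, for example via a tfvars file; the project name below is a placeholder:

# terraform.tfvars (placeholder value)
project_id = "my-gcp-project"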
