Commit f02fe47

add alert policies for several metrics
1 parent 119b9de commit f02fe47

File tree

4 files changed: +223 −0 lines changed


terraform/infrastructure/main.tf (+9)

@@ -136,3 +136,12 @@ module "cloud_endpoint" {
    google_project_service.project
  ]
}

module "alert_policies" {
  source     = "../modules/alert_policies"
  project_id = var.project_id

  depends_on = [
    google_project_service.project
  ]
}
+36 (new file)

@@ -0,0 +1,36 @@
<!-- BEGIN_TF_DOCS -->
## Requirements

No requirements.

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | n/a |

## Modules

No modules.

## Resources

| Name | Type |
|------|------|
| [google_monitoring_alert_policy.k8s_critical_resource_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_failed_pod_not_starting_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_pending_pod_not_starting_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_readiness_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_alert_policy.k8s_restart_alert_policy](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_notification_channel.email_channel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_notification_channel) | resource |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID to deploy to | `string` | n/a | yes |

## Outputs

No outputs.
<!-- END_TF_DOCS -->
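The module exposes a single required input, `project_id`, and no outputs. A minimal usage sketch, mirroring the call this commit adds to terraform/infrastructure/main.tf (the relative `source` path resolves to terraform/modules/alert_policies when called from terraform/infrastructure):

module "alert_policies" {
  # Path as referenced from terraform/infrastructure/main.tf
  source = "../modules/alert_policies"

  # The only required input: the GCP project the policies are created in
  project_id = var.project_id
}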
+174 (new file)

@@ -0,0 +1,174 @@
resource "google_monitoring_notification_channel" "email_channel" {
  display_name = "Alert Policy Email Notification Channel"
  type         = "email"
  labels = {
    email_address = "[email protected]"
  }
  force_delete = false
}

# Alert policy for pods stuck in the Pending phase
resource "google_monitoring_alert_policy" "k8s_pending_pod_not_starting_alert_policy" {
  display_name          = "K8s pending pods alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "Pending Pod Alert"
    condition_prometheus_query_language {
      # The metric is 0 or 1 depending on whether the pod is in the specified phase
      query               = "kube_pod_status_phase{namespace=\"hono\", phase=\"Pending\", project_id=\"${var.project_id}\"} != 0"
      duration            = "300s" # 5 min
      evaluation_interval = "30s"
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pending Pods Alert"
    content = "Pod has been in the Pending phase for over 5 minutes."
  }
}

# Alert policy for pods in the Failed phase
resource "google_monitoring_alert_policy" "k8s_failed_pod_not_starting_alert_policy" {
  display_name          = "K8s failed pods alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "Failed Pod Alert"
    condition_prometheus_query_language {
      # The metric is 0 or 1 depending on whether the pod is in the specified phase
      query               = "kube_pod_status_phase{namespace=\"hono\", phase=\"Failed\", project_id=\"${var.project_id}\"} != 0"
      duration            = "300s" # 5 min
      evaluation_interval = "30s"
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Failed Pod Alert"
    content = "Pod has been stuck in the Failed phase for over 5 minutes."
  }
}

# Alert policy for constantly restarting pods
resource "google_monitoring_alert_policy" "k8s_restart_alert_policy" {
  display_name          = "K8s pods constantly restarting alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "K8s pods in hono are continuously restarting"
    condition_threshold {
      filter          = "resource.type=\"k8s_container\" AND resource.labels.project_id=\"${var.project_id}\" AND metric.type=\"kubernetes.io/container/restart_count\" AND resource.labels.namespace_name=\"hono\""
      comparison      = "COMPARISON_GT"
      threshold_value = 3
      duration        = "300s"
      aggregations {
        alignment_period = "300s"
        # ALIGN_DELTA counts restarts within each 5-minute window, matching the
        # documented intent; ALIGN_RATE would measure restarts per second, so a
        # threshold of 3 would effectively never fire.
        per_series_aligner = "ALIGN_DELTA"
      }
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pod continuously restarting"
    content = "The pod has been restarted more than 3 times in the last 5 minutes."
  }
}

# Alert policy for critical CPU / memory request utilization
resource "google_monitoring_alert_policy" "k8s_critical_resource_alert_policy" {
  display_name          = "K8s cluster critical resource usage alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "K8s cluster cpu load is above 95%"
    condition_threshold {
      filter          = "resource.type=\"k8s_container\" AND resource.labels.project_id=\"${var.project_id}\" AND metric.type=\"kubernetes.io/container/cpu/request_utilization\" AND resource.labels.namespace_name=\"hono\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0.95 # 95% of the CPU request
      duration        = "300s"
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }
    }
  }

  conditions {
    display_name = "K8s cluster memory usage is above 90%"
    condition_threshold {
      filter          = "resource.type=\"k8s_container\" AND resource.labels.project_id=\"${var.project_id}\" AND metric.type=\"kubernetes.io/container/memory/request_utilization\" AND resource.labels.namespace_name=\"hono\""
      comparison      = "COMPARISON_GT"
      threshold_value = 0.90 # 90% of the memory request
      duration        = "300s"
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pod using a critical amount of resources"
    content = "Pod has been using more than 95% of its CPU request or 90% of its memory request for the last 5 minutes."
  }
}

# Alert policy for containers failing readiness
resource "google_monitoring_alert_policy" "k8s_readiness_alert_policy" {
  display_name          = "K8s readiness failures alert"
  notification_channels = [google_monitoring_notification_channel.email_channel.id]

  combiner = "OR"
  conditions {
    display_name = "Pod readiness failure alert"
    condition_prometheus_query_language {
      # Every ready container reports 1; the condition matches containers stuck at 0
      query               = "kube_pod_container_status_ready{namespace=\"hono\",project_id=\"${var.project_id}\"} == 0"
      duration            = "300s" # 5 min
      evaluation_interval = "30s"
    }
  }

  alert_strategy {
    notification_channel_strategy {
      renotify_interval = "3600s" # 1 hour
    }
  }
  severity = "warning"
  project  = var.project_id
  documentation {
    subject = "Pod not ready alert"
    content = "The pod has not been ready for over 5 minutes."
  }
}
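The notification channel above hard-codes the recipient address. As a design note, the address could be promoted to a module input so different environments can route alerts differently; a minimal sketch, where the variable name `notification_email` is hypothetical and not part of this commit:

# Hypothetical input, not part of this commit
variable "notification_email" {
  description = "E-mail address that receives alert notifications"
  type        = string
}

resource "google_monitoring_notification_channel" "email_channel" {
  display_name = "Alert Policy Email Notification Channel"
  type         = "email"
  labels = {
    # Replaces the hard-coded address with the variable
    email_address = var.notification_email
  }
  force_delete = false
}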
+4 (new file)

@@ -0,0 +1,4 @@
variable "project_id" {
  description = "The project ID to deploy to"
  type        = string
}
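Since the root configuration passes its own `project_id` through to the module (see terraform/infrastructure/main.tf above), deploying the new policies only requires supplying a value for that variable, for example via a tfvars file; the project name below is a placeholder:

# terraform.tfvars (placeholder value)
project_id = "my-gcp-project"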
