Add monitor with slack notification for uptime failures.

Michael Smit · Michael Smit · commit d03371743166 · 2025-03-24T16:24:29.000-07:00
Related to PolicyEngine/issues#224

Each service using the reusable fastapi module will now also automatically get a monitor (with alerting) for its uptime check.
diff --git a/.github/workflows/gcp-deploy.reusable.yml b/.github/workflows/gcp-deploy.reusable.yml
@@ -20,6 +20,7 @@ env:
   TF_VAR_full_container_tag: ${{ github.sha }}
   TF_VAR_simulation_container_tag: ${{ github.sha }}
   TF_VAR_hugging_face_token: ${{ secrets.HUGGING_FACE_TOKEN }}
+  TF_VAR_slack_notification_channel_name: ${{ vars.SLACK_NOTIFICATION_CHANNEL }}
   BUILD_TAG: ${{ github.run_id }}.${{ github.run_number }}.${{ github.run_attempt }}
   COMMIT_TAG: ${{ github.sha }}
 jobs:
diff --git a/terraform/infra-policyengine-api/Makefile b/terraform/infra-policyengine-api/Makefile
@@ -1,6 +1,9 @@
 # For github action deployes we use environment variables to configure terraform
 # on desktop we use apply files generated either by the infra-policyengine-api bootstrap
 # or manual user config
+REPO_URL := $(shell git remote get-url origin | sed 's/\.git$$//' | sed 's/git@github.com:/https:\/\/github.com\//')
+COMMIT_SHA := $(shell git rev-parse HEAD)
+COMMIT_URL := $(REPO_URL)/commit/$(COMMIT_SHA)
 
 # get the project ID
 include ../.bootstrap_settings/project.env
@@ -18,7 +21,7 @@ deploy: .terraform
 	@echo "Latest Full API SHA: ${FULL_SHA}"
 	@echo "Latest Simulation API SHA: ${SIM_SHA}"
 	@echo "Running terraform apply with ../.bootstrap_settings/apply.tfvars"
-	terraform apply -var-file ../.bootstrap_settings/apply.tfvars -var "full_container_tag=${TAG}@${FULL_SHA}" -var "simulation_container_tag=${TAG}@${SIM_SHA}" -lock=false
+	terraform apply -var-file ../.bootstrap_settings/apply.tfvars -var "full_container_tag=${TAG}@${FULL_SHA}" -var "simulation_container_tag=${TAG}@${SIM_SHA}" -var "commit_url=${COMMIT_URL}"
 
 .terraform: ../.bootstrap_settings/backend.tfvars
 	@echo "Initializing terraform"
diff --git a/terraform/infra-policyengine-api/Makefile.deploy b/terraform/infra-policyengine-api/Makefile.deploy
@@ -1,11 +1,15 @@
+REPO_URL := $(shell git remote get-url origin | sed 's/\.git$$//' | sed 's/git@github.com:/https:\/\/github.com\//')
+COMMIT_SHA := $(shell git rev-parse HEAD)
+COMMIT_URL := $(REPO_URL)/commit/$(COMMIT_SHA)
+
 plan-deploy: .terraform
 	terraform plan -input=false
 
 state: .terraform
 	terraform output -json
 
 deploy: .terraform
-	terraform apply -input=false -auto-approve
+	terraform apply -input=false -auto-approve -var "commit_url=${COMMIT_URL}"
 	terraform output -json > terraform_output.json
 
 .terraform:
diff --git a/terraform/infra-policyengine-api/main.tf b/terraform/infra-policyengine-api/main.tf
@@ -28,6 +28,8 @@ module "cloud_run_full_api" {
   project_id=var.project_id
   region=var.region
   is_prod=var.is_prod
+  slack_notification_channel_name=var.slack_notification_channel_name
+  commit_url = var.commit_url
 }
 
 # https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_v2_service
diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/main.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/main.tf
@@ -73,6 +73,10 @@ resource "google_cloud_run_v2_service" "api" {
   }
 }
 
+data "google_project" "project" {
+  project_id = var.project_id
+}
+
 data "google_iam_policy" "api" {
   binding {
     role = "roles/run.invoker"
diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/uptime.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/uptime.tf
@@ -4,7 +4,7 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
   # response time can be slower because of container spin up in beta.
   timeout      = var.is_prod ? "1s" : "10s"
   # don't waste resources waking up the beta container all the time. Just do it once a day.
-  period       = var.is_prod ? "300s" : "86400s"
+  period       = var.is_prod ? "300s" : "900s"
 
   http_check {
     path         = "/ping/alive"
@@ -24,4 +24,59 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
       host       =  regex("://([^/:]+)", google_cloud_run_v2_service.api.uri)[0]
     }
   }
-}
+}
+
+# Only reference the Slack notification channel if the variable is not empty
+locals {
+  use_slack_notification = var.slack_notification_channel_name != ""
+  notification_channels = local.use_slack_notification ? [data.google_monitoring_notification_channel.slack[0].name] : []
+}
+
+# The Slack notification channel itself must be created manually in the console beforehand.
+# This data source only looks up that existing channel by its display name.
+data "google_monitoring_notification_channel" "slack" {
+  count        = local.use_slack_notification ? 1 : 0
+  display_name = var.slack_notification_channel_name
+}
+
+# Create the alerting policy with PromQL that references your uptime check
+resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
+  display_name = "${var.name} Health Check Alert"
+  combiner     = "OR"
+  
+  conditions {
+    display_name = "Uptime Check Failed"
+    condition_prometheus_query_language {
+        query = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check.uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
+        duration = "60s"
+        evaluation_interval = "60s"
+        alert_rule = "OnPresentAndFiring"
+        rule_group = "health_checks"
+        labels = {
+          severity = "critical"
+        }
+    }
+  }
+
+  notification_channels = local.notification_channels
+
+  # Add documentation with more detailed information for Slack messages
+  documentation {
+  content = <<-EOT
+      🚨 *${var.name} Service Health Alert*
+
+      The ${var.name} service is failing its health check at endpoint `/ping/alive`.
+
+      *Troubleshooting Steps:*
+      - Check the [${var.name} Cloud Run service](https://console.cloud.google.com/run/detail/${var.region}/${var.name}/metrics?project=${var.project_id})
+      - Check the [latest changes](${var.commit_url})
+      EOT
+    mime_type = "text/markdown"
+  }
+  
+
+  # Auto-close to reduce alert fatigue
+  #alert_strategy {
+  #  auto_close = var.is_prod ? "1800s" : "3600s" # 30 minutes for prod, 1 hour for non-prod
+  #}
+}
diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/variables.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/variables.tf
@@ -33,7 +33,16 @@ variable "name" {
   type = string
 }
 
+variable "slack_notification_channel_name" {
+  type = string
+  default = ""
+}
 
 variable "test_account_email" {
   type = string
 }
+
+variable "commit_url" {
+  type = string
+  description = "URL of the commit this deployment is associated with"
+}
diff --git a/terraform/infra-policyengine-api/variables.tf b/terraform/infra-policyengine-api/variables.tf
@@ -30,3 +30,14 @@ variable "hugging_face_token" {
   type        = string
   sensitive   = true
 }
+
+variable "slack_notification_channel_name" {
+  description = "Manually configured slack notification channel's name"
+  type = string
+  default = ""
+}
+
+variable "commit_url" {
+  type = string
+  description = "URL of the commit this deployment is associated with"
+}
diff --git a/terraform/project-policyengine-api/scripts/bootstrap.sh b/terraform/project-policyengine-api/scripts/bootstrap.sh
@@ -61,6 +61,9 @@ terraform apply -var "org_id=${org_id}" -var "billing_account=${billing_account}
     -var "github_repo_owner_id=${github_repo_owner_id}" -var "github_repo=${github_repo}"
 terraform init -migrate-state
 
+echo " (OPTIONALLY) Please go into the console and create a slack notification channel. Write down the display name"
+read -p "Enter notification channel display name (LEAVE EMPTY FOR NONE):" slack_notification_channel_name
+
 
 project_id=$(terraform output -raw project_id)
 project_bucket=$(terraform output -raw project_bucket)
@@ -78,6 +81,7 @@ project_id       = "${project_id}"
 is_prod          = ${is_prod}
 github_repo_owner_id = "${github_repo_owner_id}"
 github_repo          = "${github_repo}"
+slack_notification_channel_name = "${slack_notification_channel_name}"
 EOF
 
 echo "Creating backend.tfvars which is used to configure the backend.tf settings when using terraform init"
@@ -88,4 +92,4 @@ EOF
 echo "Creating project.env for Makefiles to load the project id of the created project."
 cat > ../.bootstrap_settings/project.env << EOF
 PROJECT_ID=${project_id}
-EOF
+EOF

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,8 @@ module "cloud_run_full_api" {`
`28`	`28`	`project_id=var.project_id`
`29`	`29`	`region=var.region`
`30`	`30`	`is_prod=var.is_prod`
	`31`	`+ slack_notification_channel_name=var.slack_notification_channel_name`
	`32`	`+ commit_url = var.commit_url`
`31`	`33`	`}`
`32`	`34`
`33`	`35`	`# https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_v2_service`
Original file line number	Diff line number	Diff line change
`@@ -73,6 +73,10 @@ resource "google_cloud_run_v2_service" "api" {`
`73`	`73`	`}`
`74`	`74`	`}`
`75`	`75`
	`76`	`+data "google_project" "project" {`
	`77`	`+ project_id = var.project_id`
	`78`	`+}`
	`79`	`+`
`76`	`80`	`data "google_iam_policy" "api" {`
`77`	`81`	`binding {`
`78`	`82`	`role = "roles/run.invoker"`