Skip to content

Commit d033717

Browse files
author
Michael Smit
committed
Add monitor with slack notification for uptime failures.
Related to PolicyEngine/issues#224. Each service using the reusable fastapi module will now also get an automatic monitor for its uptime check.
1 parent c1c4532 commit d033717

File tree

9 files changed

+98
-5
lines changed

9 files changed

+98
-5
lines changed

.github/workflows/gcp-deploy.reusable.yml

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ env:
2020
TF_VAR_full_container_tag: ${{ github.sha }}
2121
TF_VAR_simulation_container_tag: ${{ github.sha }}
2222
TF_VAR_hugging_face_token: ${{ secrets.HUGGING_FACE_TOKEN }}
23+
TF_VAR_slack_notification_channel_name: ${{ vars.SLACK_NOTIFICATION_CHANNEL }}
2324
BUILD_TAG: ${{ github.run_id }}.${{ github.run_number }}.${{ github.run_attempt }}
2425
COMMIT_TAG: ${{ github.sha }}
2526
jobs:

terraform/infra-policyengine-api/Makefile

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# For GitHub Actions deploys we use environment variables to configure Terraform;
22
# on desktop we use apply files generated either by the infra-policyengine-api bootstrap
33
# or manual user config
4+
REPO_URL := $(shell git remote get-url origin | sed 's/\.git$$//' | sed 's/git@github.com:/https:\/\/github.com\//')
5+
COMMIT_SHA := $(shell git rev-parse HEAD)
6+
COMMIT_URL := $(REPO_URL)/commit/$(COMMIT_SHA)
47

58
# get the project ID
69
include ../.bootstrap_settings/project.env
@@ -18,7 +21,7 @@ deploy: .terraform
1821
@echo "Latest Full API SHA: ${FULL_SHA}"
1922
@echo "Latest Simulation API SHA: ${SIM_SHA}"
2023
@echo "Running terraform apply with ../.bootstrap_settings/apply.tfvars"
21-
terraform apply -var-file ../.bootstrap_settings/apply.tfvars -var "full_container_tag=${TAG}@${FULL_SHA}" -var "simulation_container_tag=${TAG}@${SIM_SHA}" -lock=false
24+
terraform apply -var-file ../.bootstrap_settings/apply.tfvars -var "full_container_tag=${TAG}@${FULL_SHA}" -var "simulation_container_tag=${TAG}@${SIM_SHA}" -var "commit_url=${COMMIT_URL}"
2225

2326
.terraform: ../.bootstrap_settings/backend.tfvars
2427
@echo "Initializing terraform"

terraform/infra-policyengine-api/Makefile.deploy

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
1+
REPO_URL := $(shell git remote get-url origin | sed 's/\.git$$//' | sed 's/git@github.com:/https:\/\/github.com\//')
2+
COMMIT_SHA := $(shell git rev-parse HEAD)
3+
COMMIT_URL := $(REPO_URL)/commit/$(COMMIT_SHA)
4+
15
plan-deploy: .terraform
26
terraform plan -input=false
37

48
state: .terraform
59
terraform output -json
610

711
deploy: .terraform
8-
terraform apply -input=false -auto-approve
12+
terraform apply -input=false -auto-approve -var "commit_url=${COMMIT_URL}"
913
terraform output -json > terraform_output.json
1014

1115
.terraform:

terraform/infra-policyengine-api/main.tf

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ module "cloud_run_full_api" {
2828
project_id=var.project_id
2929
region=var.region
3030
is_prod=var.is_prod
31+
slack_notification_channel_name=var.slack_notification_channel_name
32+
commit_url = var.commit_url
3133
}
3234

3335
# https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_v2_service

terraform/infra-policyengine-api/modules/fastapi_cloudrun/main.tf

+4
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ resource "google_cloud_run_v2_service" "api" {
7373
}
7474
}
7575

76+
data "google_project" "project" {
77+
project_id = var.project_id
78+
}
79+
7680
data "google_iam_policy" "api" {
7781
binding {
7882
role = "roles/run.invoker"

terraform/infra-policyengine-api/modules/fastapi_cloudrun/uptime.tf

+57-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
44
# response time can be slower because of container spin up in beta.
55
timeout = var.is_prod ? "1s" : "10s"
66
# don't waste resources waking up the beta container all the time. Just do it every 15 minutes.
7-
period = var.is_prod ? "300s" : "86400s"
7+
period = var.is_prod ? "300s" : "900s"
88

99
http_check {
1010
path = "/ping/alive"
@@ -24,4 +24,59 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
2424
host = regex("://([^/:]+)", google_cloud_run_v2_service.api.uri)[0]
2525
}
2626
}
27-
}
27+
}
28+
29+
# Only reference the Slack notification channel if the variable is not empty
30+
locals {
31+
use_slack_notification = var.slack_notification_channel_name != ""
32+
notification_channels = local.use_slack_notification ? [data.google_monitoring_notification_channel.slack[0].name] : []
33+
}
34+
35+
# The Slack notification channel itself must be created manually in the console first.
36+
# Reference an existing Slack notification channel that was set up in the console
37+
data "google_monitoring_notification_channel" "slack" {
38+
count = local.use_slack_notification ? 1 : 0
39+
display_name = var.slack_notification_channel_name
40+
}
41+
42+
# Create the alerting policy with PromQL that references your uptime check
43+
resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
44+
display_name = "${var.name} Health Check Alert"
45+
combiner = "OR"
46+
47+
conditions {
48+
display_name = "Uptime Check Failed"
49+
condition_prometheus_query_language {
50+
query = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check.uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
51+
duration = "60s"
52+
evaluation_interval = "60s"
53+
alert_rule = "OnPresentAndFiring"
54+
rule_group = "health_checks"
55+
labels = {
56+
severity = "critical"
57+
}
58+
}
59+
}
60+
61+
notification_channels = local.notification_channels
62+
63+
# Add documentation with more detailed information for Slack messages
64+
documentation {
65+
content = <<-EOT
66+
🚨 *${var.name} Service Health Alert*
67+
68+
The ${var.name} service is failing its health check at endpoint `/ping/alive`.
69+
70+
*Troubleshooting Steps:*
71+
- Check the [${var.name} Cloud Run service](https://console.cloud.google.com/run/detail/${var.region}/${var.name}/metrics?project=${var.project_id})
72+
- Check the [latest changes](${var.commit_url})
73+
EOT
74+
mime_type = "text/markdown"
75+
}
76+
77+
78+
# Auto-close to reduce alert fatigue
79+
#alert_strategy {
80+
# auto_close = var.is_prod ? "1800s" : "3600s" # 30 minutes for prod, 1 hour for non-prod
81+
#}
82+
}

terraform/infra-policyengine-api/modules/fastapi_cloudrun/variables.tf

+9
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,16 @@ variable "name" {
3333
type = string
3434
}
3535

36+
variable "slack_notification_channel_name" {
37+
type = string
38+
default = ""
39+
}
3640

3741
variable "test_account_email" {
3842
type = string
3943
}
44+
45+
variable "commit_url" {
46+
type = string
47+
description = "URL of the commit this deployment is associated with"
48+
}

terraform/infra-policyengine-api/variables.tf

+11
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,14 @@ variable "hugging_face_token" {
3030
type = string
3131
sensitive = true
3232
}
33+
34+
variable "slack_notification_channel_name" {
35+
description = "Manually configured slack notification channel's name"
36+
type = string
37+
default = ""
38+
}
39+
40+
variable "commit_url" {
41+
type = string
42+
description = "URL of the commit this deployment is associated with"
43+
}

terraform/project-policyengine-api/scripts/bootstrap.sh

+5-1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ terraform apply -var "org_id=${org_id}" -var "billing_account=${billing_account}
6161
-var "github_repo_owner_id=${github_repo_owner_id}" -var "github_repo=${github_repo}"
6262
terraform init -migrate-state
6363

64+
echo " (OPTIONALLY) Please go into the console and create a slack notification channel. Write down the display name"
65+
read -p "Enter notification channel display name (LEAVE EMPTY FOR NONE):" slack_notification_channel_name
66+
6467

6568
project_id=$(terraform output -raw project_id)
6669
project_bucket=$(terraform output -raw project_bucket)
@@ -78,6 +81,7 @@ project_id = "${project_id}"
7881
is_prod = ${is_prod}
7982
github_repo_owner_id = "${github_repo_owner_id}"
8083
github_repo = "${github_repo}"
84+
slack_notification_channel_name = "${slack_notification_channel_name}"
8185
EOF
8286

8387
echo "Creating backend.tfvars which is used to configure the backend.tf settings when using terraform init"
@@ -88,4 +92,4 @@ EOF
8892
echo "Creating project.env for Makefiles to load the project id of the created project."
8993
cat > ../.bootstrap_settings/project.env << EOF
9094
PROJECT_ID=${project_id}
91-
EOF
95+
EOF

0 commit comments

Comments
 (0)