@@ -4,7 +4,7 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
   # response time can be slower because of container spin up in beta.
   timeout = var.is_prod ? "1s" : "10s"
-  # don't waste resources waking up the beta container all the time. Just do it once a day.
-  period  = var.is_prod ? "300s" : "86400s"
+  # don't waste resources waking up the beta container all the time. Just check every 15 minutes.
+  period  = var.is_prod ? "300s" : "900s"
 
   http_check {
     path = "/ping/alive"
@@ -24,4 +24,59 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
     host = regex("://([^/:]+)", google_cloud_run_v2_service.api.uri)[0]
   }
 }
-}
+}
+
+# Only reference the Slack notification channel if the variable is not empty
+locals {
+  use_slack_notification = var.slack_notification_channel_name != ""
+  notification_channels  = local.use_slack_notification ? [data.google_monitoring_notification_channel.slack[0].name] : []
+}
+
+# The Slack channel itself has to be created manually in the console.
+# This data source references that existing notification channel by display name.
+data "google_monitoring_notification_channel" "slack" {
+  count        = local.use_slack_notification ? 1 : 0
+  display_name = var.slack_notification_channel_name
+}
+
+# Create the alerting policy with PromQL that references the uptime check
+resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
+  display_name = "${var.name} Health Check Alert"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Uptime Check Failed"
+    condition_prometheus_query_language {
+      query               = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check.uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
+      duration            = "60s"
+      evaluation_interval = "60s"
+      alert_rule          = "OnPresentAndFiring"
+      rule_group          = "health_checks"
+      labels = {
+        severity = "critical"
+      }
+    }
+  }
+
+  notification_channels = local.notification_channels
+
+  # Documentation block with more detailed information for Slack messages
+  documentation {
+    content   = <<-EOT
+      🚨 *${var.name} Service Health Alert*
+
+      The ${var.name} service is failing its health check at endpoint `/ping/alive`.
+
+      *Troubleshooting Steps:*
+      - Check the [${var.name} Cloud Run service](https://console.cloud.google.com/run/detail/${var.region}/${var.name}/metrics?project=${var.project_id})
+      - Check the [latest changes](${var.commit_url})
+    EOT
+    mime_type = "text/markdown"
+  }
+
+
+  # Auto-close to reduce alert fatigue
+  # alert_strategy {
+  #   auto_close = var.is_prod ? "1800s" : "3600s" # 30 minutes for prod, 1 hour for non-prod
+  # }
+}
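
For reference, the configuration above relies on several input variables (var.is_prod, var.name, var.region, var.project_id, var.commit_url, var.slack_notification_channel_name) that are not part of this diff. Below is a minimal sketch of how they might be declared; the types and descriptions are assumptions inferred from how the variables are used, not the project's actual variables.tf.

# Hypothetical variables.tf sketch — names come from the var.* references in the
# diff above; types, descriptions, and defaults are assumptions.
variable "is_prod" {
  description = "Whether this environment is production (tighter timeout, more frequent checks)."
  type        = bool
}

variable "name" {
  description = "Cloud Run service name, used in alert titles and console links."
  type        = string
}

variable "region" {
  description = "Region of the Cloud Run service, used to build the console URL."
  type        = string
}

variable "project_id" {
  description = "GCP project ID, used to build the console URL."
  type        = string
}

variable "commit_url" {
  description = "Link to the latest changes, surfaced in the alert documentation."
  type        = string
}

variable "slack_notification_channel_name" {
  description = "Display name of an existing Slack notification channel; leave empty to skip Slack alerts."
  type        = string
  default     = ""
}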