Tweak perf and alarm settings for services.

Michael Smit · Michael Smit · commit 1c21c8a6a155 · 2025-04-02T18:47:05.000-07:00
Fixes #25. 1. Make the uptime check less sensitive. 2. Make the memory usage check less sensitive. 3. Fine-tune the max request count, min instance count, etc. for the simulation API vs. the full API: * a single simulation API container will accept no more than 2 concurrent requests; * the server runs 3 workers (2 for requests, plus 1 for handling liveness checks).
diff --git a/projects/policyengine-api-simulation/Dockerfile b/projects/policyengine-api-simulation/Dockerfile
@@ -23,4 +23,7 @@ COPY src ./src/
 RUN poetry install --with main --no-root
 
 EXPOSE 8080
-CMD ["poetry", "run", "uvicorn", "src.policyengine_api_simulation.main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "4"]
+# bottleneck is memory
+# two workers to handle external requests (max allowed = 2)
+# one worker to do the liveness check
+CMD ["poetry", "run", "uvicorn", "src.policyengine_api_simulation.main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "3"]
diff --git a/terraform/infra-policyengine-api/main.tf b/terraform/infra-policyengine-api/main.tf
@@ -28,9 +28,18 @@ module "cloud_run_full_api" {
 
   project_id=var.project_id
   region=var.region
-  is_prod=var.is_prod
   slack_notification_channel_name=var.slack_notification_channel_name
   commit_url = var.commit_url
+
+  uptime_timeout = var.is_prod ? "1s" : "30s"
+  min_instance_count = var.is_prod ? 1: 0
+  max_instance_count = 2
+  #guessing. Need to tune.
+  max_instance_request_concurrency = var.is_prod ? 80: 1
+  #this service should return basically immediately to all requests.
+  timeout = "1s"
+
+  enable_uptime_check = true
 }
 
 module "cloud_run_simulation_api" {
@@ -56,13 +65,26 @@ module "cloud_run_simulation_api" {
     memory = "16Gi"
   }
 
-  request_based_billing = true
+  
 
   project_id=var.project_id
   region=var.region
-  is_prod=var.is_prod
   slack_notification_channel_name=var.slack_notification_channel_name
   commit_url = var.commit_url
+
+  uptime_timeout = var.is_prod ? "1s" : "30s"
+  request_based_billing = true
+  min_instance_count = var.is_prod ? 1: 0
+  #arbitrary number. May need to tweak
+  max_instance_count = var.is_prod ? 10 : 1
+  #we are currently memory bound. internally it runs 3 handlers. keep one open for liveness checks.
+  max_instance_request_concurrency = 2
+  #permit max timeout since we run entire population simulations.
+  timeout = "3600s"
+
+  # This service is memory bound and only accepts two long-running requests per container
+  # (see max_instance_request_concurrency), so we don't use an uptime check
+  enable_uptime_check = false
 }
 
 # Create a workflow
diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/main.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/main.tf
@@ -30,20 +30,18 @@ resource "google_cloud_run_v2_service" "api" {
 
   template {
     service_account = google_service_account.api.email
-    # Assumption from cost estimate.
-    max_instance_request_concurrency = var.is_prod ? 80 : null
+    max_instance_request_concurrency = var.max_instance_request_concurrency
     containers {
       image = local.api_image
       resources {
-        #default to whatever the cheapest instance is unless in prod in which
-        # case values are again based on the cost esitmate.
         limits = {
           cpu    = var.limits.cpu
           memory = var.limits.memory
         }
         cpu_idle = var.request_based_billing ? true : false
         startup_cpu_boost = true
       }
+
       startup_probe {
         initial_delay_seconds = 0
         timeout_seconds = 1
@@ -54,17 +52,14 @@ resource "google_cloud_run_v2_service" "api" {
           path = "/ping/started"
         }
       }
-      # Only include liveness_probe in production environment so we don't
-      # waste money running beta containers.
-      dynamic "liveness_probe" {
-        for_each = var.is_prod ? [1] : []
-        content {
-          period_seconds = 30 
-          timeout_seconds = 1 
-          failure_threshold = 2
-          http_get {
-            path = "/ping/alive"
-          }
+
+      liveness_probe {
+        #once every 5 minutes
+        period_seconds = 300 
+        timeout_seconds = 1 
+        failure_threshold = 2
+        http_get {
+          path = "/ping/alive"
         }
       }
 
@@ -83,12 +78,13 @@ resource "google_cloud_run_v2_service" "api" {
     }
     scaling {
       # always keep one instance hot in prod
-      min_instance_count = var.is_prod ? 1 : 0
+      min_instance_count = var.min_instance_count
       # in beta don't create a bunch of containers
       # max in prod based on assumptions from cost estimate
-      max_instance_count = var.is_prod ? 10 : 1
+      max_instance_count = var.max_instance_count
     }
-    timeout = "3600s"
+    #Max timeout of 1 hour permitted.
+    timeout = var.timeout
   }
 }
 
diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
@@ -1,11 +1,12 @@
 
 
 resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
+  for_each = var.enable_uptime_check ? toset(["0"]) : toset([])
   display_name = "${var.name} Health Check"
   # response time can be slower because of container spin up in beta.
-  timeout      = var.is_prod ? "1s" : "10s"
-  # don't waste resources waking up the beta container all the time. Just do it once a day.
-  period       = var.is_prod ? "300s" : "900s"
+  timeout      = var.uptime_timeout
+  # once every 5 minutes
+  period       = "300s"
 
   http_check {
     path         = "/ping/alive"
@@ -27,11 +28,9 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
   }
 
   # See https://github.com/PolicyEngine/policyengine-api-v2/issues/117
-  # in beta just check from the US to reduce cost
-  # in prod check europe and the US since those are the two regions we currently primarily
-  # operate in (pending multi-region expansion which we don't currently do)
+  # we are not yet multi-regional so just check the places we operate in.
   # https://cloud.google.com/monitoring/api/ref_v3/rest/v3/projects.uptimeCheckConfigs#UptimeCheckRegion
-  selected_regions = var.is_prod ? ["USA", "EUROPE"] : ["USA"]
+  selected_regions = ["USA", "EUROPE"]
 }
 
 # Only reference the Slack notification channel if the variable is not empty
@@ -49,17 +48,16 @@ data "google_monitoring_notification_channel" "slack" {
 
 # Create the alerting policy with PromQL that references your uptime check
 resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
+  for_each = var.enable_uptime_check ? toset(["0"]) : ([])
   display_name = "${var.name} Health Check Alert"
   combiner     = "OR"
   
   conditions {
     display_name = "Uptime Check Failed"
     condition_prometheus_query_language {
-        query = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check.uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
-        duration = "60s"
-        evaluation_interval = "60s"
-        alert_rule = "OnPresentAndFiring"
-        rule_group = "health_checks"
+        query = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check[0].uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
+        #fail two consecutive checks (5 minutes)
+        duration = "600s" #10m
         labels = {
           severity = "critical"
         }
@@ -81,27 +79,21 @@ resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
       EOT
     mime_type = "text/markdown"
   }
-
-  # Auto-close to reduce alert fatigue
-  #alert_strategy {
-  #  auto_close = var.is_prod ? "1800s" : "3600s" # 30 minutes for prod, 1 hour for non-prod
-  #}
 }
 
 resource "google_monitoring_alert_policy" "limit_alert" {
   display_name = "${var.name} Limit Check"
   combiner     = "OR"
   
   conditions {
-    display_name = "Memory usage over 75%"
+    display_name = "Memory usage over 90%"
     condition_prometheus_query_language {
         #go into the monitoring console, query metrics, select the thing you want to monitor and then select the prometheus view in order to get the right syntax for these.
         #the documentation is pretty bad and none of the LLMs, including google's, know how to do these properly.
-        query = "histogram_quantile(0.95,sum by (le)(increase(run_googleapis_com:container_memory_utilizations_bucket{monitored_resource=\"cloud_run_revision\",service_name=\"${google_cloud_run_v2_service.api.name}\"}[1m]))) > .75"
+        query = "histogram_quantile(0.95,sum by (le)(increase(run_googleapis_com:container_memory_utilizations_bucket{monitored_resource=\"cloud_run_revision\",service_name=\"${google_cloud_run_v2_service.api.name}\"}[1m]))) > .9"
+        #if the memory jumps above 90% immediately notify the team in slack that we're approaching our limit.
         duration = "60s"
         evaluation_interval = "60s"
-        alert_rule = "OnPresentAndFiring"
-        rule_group = "health_checks"
         labels = {
           severity = "critical"
         }
diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/variables.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/variables.tf
@@ -1,8 +1,3 @@
-variable "is_prod" {
-  description = "Whether this is a production deployment"
-  type        = bool
-}
-
 variable "project_id" {
   description = "The GCP project to deploy to"
   type        = string
@@ -30,18 +25,45 @@ variable limits {
   })
 }
 
+variable min_instance_count {
+  description = "The minimum number of instances to keep 'hot' regardless of requests"
+  type = number
+}
+
+variable max_instance_count {
+  description = "The maximum number of instances to allow;"
+  type = number
+  default = 1
+}
+
+variable max_instance_request_concurrency {
+  description = "How many requests can a single container handle at once"
+  type = number
+}
+
 variable request_based_billing {
   description = "Whether to use request-based billing for the Cloud Run service"
   type        = bool
   default     = false
 }
 
+variable "uptime_timeout" {
+  type = string
+  description = "number of seconds to wait for the uptime check response before failing"
+}
+
+
 variable "environment_secrets" {
   description = "Map of environment variable names to their corresponding secret IDs in Google Secret Manager"
   type = map(string)
   default = {}
 }
 
+variable "timeout" {
+  description = "Max time a container can take to respond to a request up to 1 hour"
+  type = string
+}
+
 variable "description" {
   type = string
 }
@@ -71,3 +93,8 @@ variable "service_roles" {
   description = "roles to give the service account for this clodurun service"
   default = []
 }
+
+variable "enable_uptime_check" {
+  type = bool
+  description = "Should this autogenerate an uptime check for the cloudrun service"
+}