Skip to content

Commit 1c21c8a

Browse files
author
Michael Smit
committed
Tweak perf and alarm settings for services.
Fixes #25. 1. Make the uptime check less sensitive. 2. Make the memory usage check less sensitive. 3. Fine-tune the max request count, min instance count, etc. for the simulation API vs. the full API: * a single container for the simulation API will accept no more than 2 requests; * the server runs 3 workers (2 for requests + 1 for handling liveness checks).
1 parent b515fb7 commit 1c21c8a

File tree

5 files changed

+88
-48
lines changed

5 files changed

+88
-48
lines changed

projects/policyengine-api-simulation/Dockerfile

+4-1
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,7 @@ COPY src ./src/
2323
RUN poetry install --with main --no-root
2424

2525
EXPOSE 8080
26-
CMD ["poetry", "run", "uvicorn", "src.policyengine_api_simulation.main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "4"]
26+
# bottleneck is memory
27+
# two workers to handle external requests (max allowed = 2)
28+
# one worker to do the liveness check
29+
CMD ["poetry", "run", "uvicorn", "src.policyengine_api_simulation.main:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "3"]

terraform/infra-policyengine-api/main.tf

+25-3
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,18 @@ module "cloud_run_full_api" {
2828

2929
project_id=var.project_id
3030
region=var.region
31-
is_prod=var.is_prod
3231
slack_notification_channel_name=var.slack_notification_channel_name
3332
commit_url = var.commit_url
33+
34+
uptime_timeout = var.is_prod ? "1s" : "30s"
35+
min_instance_count = var.is_prod ? 1: 0
36+
max_instance_count = 2
37+
#guessing. Need to tune.
38+
max_instance_request_concurrency = var.is_prod ? 80: 1
39+
#this service should return basically immediately to all requests.
40+
timeout = "1s"
41+
42+
enable_uptime_check = true
3443
}
3544

3645
module "cloud_run_simulation_api" {
@@ -56,13 +65,26 @@ module "cloud_run_simulation_api" {
5665
memory = "16Gi"
5766
}
5867

59-
request_based_billing = true
68+
6069

6170
project_id=var.project_id
6271
region=var.region
63-
is_prod=var.is_prod
6472
slack_notification_channel_name=var.slack_notification_channel_name
6573
commit_url = var.commit_url
74+
75+
uptime_timeout = var.is_prod ? "1s" : "30s"
76+
request_based_billing = true
77+
min_instance_count = var.is_prod ? 1: 0
78+
#arbitrary number. May need to tweak
79+
max_instance_count = var.is_prod ? 10 : 1
80+
#we are currently memory bound. internally it runs 3 handlers. keep one open for liveness checks.
81+
max_instance_request_concurrency = 2
82+
#permit max timeout since we run entire population simulations.
83+
timeout = "3600s"
84+
85+
# This service can't really handle more than one request in a single container so
86+
# we don't use uptime check
87+
enable_uptime_check = false
6688
}
6789

6890
# Create a workflow

terraform/infra-policyengine-api/modules/fastapi_cloudrun/main.tf

+14-18
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,18 @@ resource "google_cloud_run_v2_service" "api" {
3030

3131
template {
3232
service_account = google_service_account.api.email
33-
# Assumption from cost estimate.
34-
max_instance_request_concurrency = var.is_prod ? 80 : null
33+
max_instance_request_concurrency = var.max_instance_request_concurrency
3534
containers {
3635
image = local.api_image
3736
resources {
38-
#default to whatever the cheapest instance is unless in prod in which
39-
# case values are again based on the cost esitmate.
4037
limits = {
4138
cpu = var.limits.cpu
4239
memory = var.limits.memory
4340
}
4441
cpu_idle = var.request_based_billing ? true : false
4542
startup_cpu_boost = true
4643
}
44+
4745
startup_probe {
4846
initial_delay_seconds = 0
4947
timeout_seconds = 1
@@ -54,17 +52,14 @@ resource "google_cloud_run_v2_service" "api" {
5452
path = "/ping/started"
5553
}
5654
}
57-
# Only include liveness_probe in production environment so we don't
58-
# waste money running beta containers.
59-
dynamic "liveness_probe" {
60-
for_each = var.is_prod ? [1] : []
61-
content {
62-
period_seconds = 30
63-
timeout_seconds = 1
64-
failure_threshold = 2
65-
http_get {
66-
path = "/ping/alive"
67-
}
55+
56+
liveness_probe {
57+
#once every 5 minutes
58+
period_seconds = 300
59+
timeout_seconds = 1
60+
failure_threshold = 2
61+
http_get {
62+
path = "/ping/alive"
6863
}
6964
}
7065

@@ -83,12 +78,13 @@ resource "google_cloud_run_v2_service" "api" {
8378
}
8479
scaling {
8580
# always keep one instance hot in prod
86-
min_instance_count = var.is_prod ? 1 : 0
81+
min_instance_count = var.min_instance_count
8782
# in beta don't create a bunch of containers
8883
# max in prod based on assumptions from cost estimate
89-
max_instance_count = var.is_prod ? 10 : 1
84+
max_instance_count = var.max_instance_count
9085
}
91-
timeout = "3600s"
86+
#Max timeout of 1 hour permitted.
87+
timeout = var.timeout
9288
}
9389
}
9490

terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf

+13-21
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11

22

33
resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
4+
for_each = var.enable_uptime_check ? toset(["0"]) : toset([])
45
display_name = "${var.name} Health Check"
56
# response time can be slower because of container spin up in beta.
6-
timeout = var.is_prod ? "1s" : "10s"
7-
# don't waste resources waking up the beta container all the time. Just do it once a day.
8-
period = var.is_prod ? "300s" : "900s"
7+
timeout = var.uptime_timeout
8+
# once every 5 minutes
9+
period = "300s"
910

1011
http_check {
1112
path = "/ping/alive"
@@ -27,11 +28,9 @@ resource "google_monitoring_uptime_check_config" "cloudrun_health_check" {
2728
}
2829

2930
# See https://github.com/PolicyEngine/policyengine-api-v2/issues/117
30-
# in beta just check from the US to reduce cost
31-
# in prod check europe and the US since those are the two regions we currently primarily
32-
# operate in (pending multi-region expansion which we don't currently do)
31+
# we are not yet multi-regional so just check the places we operate in.
3332
# https://cloud.google.com/monitoring/api/ref_v3/rest/v3/projects.uptimeCheckConfigs#UptimeCheckRegion
34-
selected_regions = var.is_prod ? ["USA", "EUROPE"] : ["USA"]
33+
selected_regions = ["USA", "EUROPE"]
3534
}
3635

3736
# Only reference the Slack notification channel if the variable is not empty
@@ -49,17 +48,16 @@ data "google_monitoring_notification_channel" "slack" {
4948

5049
# Create the alerting policy with PromQL that references your uptime check
5150
resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
51+
for_each = var.enable_uptime_check ? toset(["0"]) : ([])
5252
display_name = "${var.name} Health Check Alert"
5353
combiner = "OR"
5454

5555
conditions {
5656
display_name = "Uptime Check Failed"
5757
condition_prometheus_query_language {
58-
query = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check.uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
59-
duration = "60s"
60-
evaluation_interval = "60s"
61-
alert_rule = "OnPresentAndFiring"
62-
rule_group = "health_checks"
58+
query = "avg by (check_id)(avg_over_time(monitoring_googleapis_com:uptime_check_check_passed{check_id=\"${google_monitoring_uptime_check_config.cloudrun_health_check[0].uptime_check_id}\", monitored_resource=\"uptime_url\"}[60s])) < 1"
59+
#fail two consecutive checks (checks run once every 5 minutes)
60+
duration = "600s" #10m
6361
labels = {
6462
severity = "critical"
6563
}
@@ -81,27 +79,21 @@ resource "google_monitoring_alert_policy" "cloudrun_health_alert" {
8179
EOT
8280
mime_type = "text/markdown"
8381
}
84-
85-
# Auto-close to reduce alert fatigue
86-
#alert_strategy {
87-
# auto_close = var.is_prod ? "1800s" : "3600s" # 30 minutes for prod, 1 hour for non-prod
88-
#}
8982
}
9083

9184
resource "google_monitoring_alert_policy" "limit_alert" {
9285
display_name = "${var.name} Limit Check"
9386
combiner = "OR"
9487

9588
conditions {
96-
display_name = "Memory usage over 75%"
89+
display_name = "Memory usage over 90%"
9790
condition_prometheus_query_language {
9891
#go into the monitoring console, query metrics, select the thing you want to monitor and then select the prometheus view in order to get the right syntax for these.
9992
#the documentation is pretty bad and none of the LLMs, including google's, know how to do these properly.
100-
query = "histogram_quantile(0.95,sum by (le)(increase(run_googleapis_com:container_memory_utilizations_bucket{monitored_resource=\"cloud_run_revision\",service_name=\"${google_cloud_run_v2_service.api.name}\"}[1m]))) > .75"
93+
query = "histogram_quantile(0.95,sum by (le)(increase(run_googleapis_com:container_memory_utilizations_bucket{monitored_resource=\"cloud_run_revision\",service_name=\"${google_cloud_run_v2_service.api.name}\"}[1m]))) > .9"
94+
#if the memory jumps above 90% immediately notify the team in slack that we're approaching our limit.
10195
duration = "60s"
10296
evaluation_interval = "60s"
103-
alert_rule = "OnPresentAndFiring"
104-
rule_group = "health_checks"
10597
labels = {
10698
severity = "critical"
10799
}

terraform/infra-policyengine-api/modules/fastapi_cloudrun/variables.tf

+32-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
variable "is_prod" {
2-
description = "Whether this is a production deployment"
3-
type = bool
4-
}
5-
61
variable "project_id" {
72
description = "The GCP project to deploy to"
83
type = string
@@ -30,18 +25,45 @@ variable limits {
3025
})
3126
}
3227

28+
variable min_instance_count {
29+
description = "The minimum number of instances to keep 'hot' regardless of requests"
30+
type = number
31+
}
32+
33+
variable max_instance_count {
34+
description = "The maximum number of instances to allow;"
35+
type = number
36+
default = 1
37+
}
38+
39+
variable max_instance_request_concurrency {
40+
description = "How many requests can a single container handle at once"
41+
type = number
42+
}
43+
3344
variable request_based_billing {
3445
description = "Whether to use request-based billing for the Cloud Run service"
3546
type = bool
3647
default = false
3748
}
3849

50+
variable "uptime_timeout" {
51+
type = string
52+
description = "number of seconds to wait for the uptime check response before failing"
53+
}
54+
55+
3956
variable "environment_secrets" {
4057
description = "Map of environment variable names to their corresponding secret IDs in Google Secret Manager"
4158
type = map(string)
4259
default = {}
4360
}
4461

62+
variable "timeout" {
63+
description = "Max time a container can take to respond to a request up to 1 hour"
64+
type = string
65+
}
66+
4567
variable "description" {
4668
type = string
4769
}
@@ -71,3 +93,8 @@ variable "service_roles" {
7193
description = "roles to give the service account for this clodurun service"
7294
default = []
7395
}
96+
97+
variable "enable_uptime_check" {
98+
type = bool
99+
description = "Should this autogenerate an uptime check for the cloudrun service"
100+
}

0 commit comments

Comments
 (0)