File tree Expand file tree Collapse file tree 4 files changed +23
-1
lines changed Expand file tree Collapse file tree 4 files changed +23
-1
lines changed Original file line number Diff line number Diff line change @@ -316,6 +316,9 @@ The following environment variables are available for tailoring the MedCAT Servi
316316- ` SERVER_PORT ` - the port number used (default: ` 5000 ` ),
317317- ` SERVER_WORKERS ` - the number of workers serving the Flask app working in parallel (default: ` 1 ` ; only used in production server).
318318- ` SERVER_WORKER_TIMEOUT ` - the max timeout (in sec) for receiving response from worker (default: ` 300 ` ; only used with production server).
319+ - ` SERVER_GUNICORN_MAX_REQUESTS ` - maximum number of requests a worker will process before restarting (default: ` 1000 ` ),
320+ - ` SERVER_GUNICORN_MAX_REQUESTS_JITTER ` - adds randomness to ` SERVER_GUNICORN_MAX_REQUESTS ` to avoid all workers restarting simultaneously (default: ` 50 ` ),
321+ - ` SERVER_GUNICORN_EXTRA_ARGS ` - any additional Gunicorn CLI arguments to pass (default: none; example: ` SERVER_GUNICORN_EXTRA_ARGS=--backlog 20 ` ).
319322
320323The following environment variables are available for tailoring the MedCAT Service wrapper:
321324
Original file line number Diff line number Diff line change @@ -36,11 +36,13 @@ SERVER_PORT=5000
3636SERVER_WORKERS = 1
3737SERVER_WORKER_TIMEOUT = 300
3838SERVER_THREADS = 1
39+ SERVER_GUNICORN_MAX_REQUESTS = 1000
40+ SERVER_GUNICORN_MAX_REQUESTS_JITTER = 50
3941
4042# set the number of torch threads, this should be used ONLY if you are using CPUs and the default image
4143# set to -1 or 0 if you are using GPU
4244APP_TORCH_THREADS = 8
4345
4446# GPU SETTING
4547# CAUTION, use only if you are using the GPU docker image.
46- APP_CUDA_DEVICE_COUNT = 1
48+ APP_CUDA_DEVICE_COUNT = - 1
Original file line number Diff line number Diff line change @@ -36,6 +36,8 @@ SERVER_PORT=5000
3636SERVER_WORKERS = 1
3737SERVER_WORKER_TIMEOUT = 300
3838SERVER_THREADS = 1
39+ SERVER_GUNICORN_MAX_REQUESTS = 1000
40+ SERVER_GUNICORN_MAX_REQUESTS_JITTER = 50
3941
4042# set the number of torch threads, this should be used ONLY if you are using CPUs and the default image
4143# set to -1 or 0 if you are using GPU
Original file line number Diff line number Diff line change @@ -33,13 +33,25 @@ if [ -z ${SERVER_WORKER_TIMEOUT+x} ]; then
3333 echo " SERVER_WORKER_TIMEOUT is unset -- setting to default (sec): $SERVER_WORKER_TIMEOUT " ;
3434fi
3535
36+ if [ -z ${SERVER_GUNICORN_MAX_REQUESTS+x} ]; then
37+ SERVER_GUNICORN_MAX_REQUESTS=1000;
38+ echo " SERVER_GUNICORN_MAX_REQUESTS is unset -- setting to default: $SERVER_GUNICORN_MAX_REQUESTS " ;
39+ fi
40+
41+ if [ -z ${SERVER_GUNICORN_MAX_REQUESTS_JITTER+x} ]; then
42+ SERVER_GUNICORN_MAX_REQUESTS_JITTER=50;
43+ echo " SERVER_GUNICORN_MAX_REQUESTS_JITTER is unset -- setting to default: $SERVER_GUNICORN_MAX_REQUESTS_JITTER " ;
44+ fi
45+
3646# Note - SERVER_ACCESS_LOG_FORMAT is unused when worker-class is set to UvicornWorker
3747SERVER_ACCESS_LOG_FORMAT=" %(t)s [ACCESS] %(h)s \" %(r)s\" %(s)s \" %(f)s\" \" %(a)s\" "
3848
3949# start the server
4050#
4151# Using Gunicorn, even though FastAPI recommends Uvicorn, to keep support for the post_fork config
4252echo " Starting up the service using gunicorn server ..."
53+ set -x
54+
4355exec gunicorn \
4456 --bind " $SERVER_HOST :$SERVER_PORT " \
4557 --workers=" $SERVER_WORKERS " \
@@ -50,5 +62,8 @@ exec gunicorn \
5062 --error-logfile=- \
5163 --log-level info \
5264 --config /cat/config.py \
65+ --max-requests=" $SERVER_GUNICORN_MAX_REQUESTS " \
66+ --max-requests-jitter=" $SERVER_GUNICORN_MAX_REQUESTS_JITTER " \
67+ ${SERVER_GUNICORN_EXTRA_ARGS:- } \
5368 --worker-class uvicorn.workers.UvicornWorker \
5469 medcat_service.main:app
You can’t perform that action at this time.
0 commit comments