From cfa1806a7a6c3c8c4367bc94e605f88099f65ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 8 Mar 2026 09:47:49 -0700 Subject: [PATCH 1/7] docs: document Python version constraints for deployed workers GPU workers are pinned to Python 3.12 (torch/CUDA only installed for 3.12 in base image). CPU workers support 3.10-3.12. Build pipeline handles wheel selection automatically. Also bumps requires-python to <3.14 and runpod-flash to >=1.7.0. --- README.md | 9 ++++++++- pyproject.toml | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6fc04fd..c73bcdf 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,13 @@ async def generate_image(prompt: str) -> bytes: - **uv**: Install with `curl -LsSf https://astral.sh/uv/install.sh | sh` - **Runpod account**: [Sign up here](https://runpod.io/console/signup) +### Python version in deployed workers + +Your local Python version does not affect what runs in the cloud. `flash build` downloads wheels for the container's Python version automatically. + +- **GPU workers**: Python 3.12 only. The GPU base image ships multiple interpreters (3.9-3.14) for interactive pod use, but torch and CUDA libraries are installed only for 3.12. +- **CPU workers**: Python 3.10, 3.11, or 3.12. Configurable via `PYTHON_VERSION` build arg. 
+ ## Quick Start ```bash @@ -139,7 +146,7 @@ print(job.output) Workers automatically scale based on demand: - `workers=(0, 3)` - Scale from 0 to 3 workers (cost-efficient) - `workers=(1, 5)` - Keep 1 warm, scale up to 5 -- `idle_timeout=5` - Minutes before scaling down +- `idle_timeout=5` - Seconds before scaling down ## Resources diff --git a/pyproject.toml b/pyproject.toml index 83b1b42..c54284a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,9 +3,9 @@ name = "runpod-flash-examples" version = "1.0.0" description = "A collection of example applications showcasing Runpod Flash - a framework for building production-ready AI applications with distributed GPU and CPU computing." readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.10,<3.14" dependencies = [ - "runpod-flash>=1.4.0", + "runpod-flash>=1.7.0", ] [dependency-groups] From dd03f4a95b8eb89639ac8c0c99d55ae17a4315b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 8 Mar 2026 09:47:55 -0700 Subject: [PATCH 2/7] refactor: migrate GpuGroup.ANY to GpuType with specific GPU models Replace GpuGroup.ANY with explicit GpuType values (e.g. GpuType.NVIDIA_GEFORCE_RTX_4090) across all examples, docs, and contributing guide. Specific GPU types give users faster provisioning and predictable behavior. 
--- 01_getting_started/01_hello_world/README.md | 6 +- 01_getting_started/03_mixed_workers/README.md | 4 +- 01_getting_started/04_dependencies/README.md | 56 +++++++++---------- .../05_load_balancer/README.md | 4 +- .../05_load_balancer/gpu_lb.py | 8 ++- .../01_autoscaling/README.md | 12 ++-- .../01_autoscaling/gpu_worker.py | 10 ++-- .../01_network_volumes/gpu_worker.py | 9 +-- CLAUDE.md | 12 ++-- CONTRIBUTING.md | 10 ++-- docs/cli/troubleshooting.md | 22 ++++---- docs/cli/workflows.md | 32 +++++------ 12 files changed, 93 insertions(+), 92 deletions(-) diff --git a/01_getting_started/01_hello_world/README.md b/01_getting_started/01_hello_world/README.md index a72e3b0..91181b9 100644 --- a/01_getting_started/01_hello_world/README.md +++ b/01_getting_started/01_hello_world/README.md @@ -112,9 +112,9 @@ The `@Endpoint` decorator transparently executes functions on serverless infrast - Handles serialization and resource management ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType -@Endpoint(name="my-worker", gpu=GpuGroup.ANY, workers=(0, 3)) +@Endpoint(name="my-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 3)) async def my_function(data: dict) -> dict: return {"result": "processed"} ``` @@ -145,7 +145,7 @@ flash run ## Next Steps -- Customize GPU type: Change `GpuGroup.ANY` to a specific GPU (e.g. `GpuGroup.ADA_24`, `GpuGroup.AMPERE_80`) +- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. 
`GpuType.NVIDIA_A100_80GB`, `GpuType.NVIDIA_H100_80GB`) - Add your own GPU-accelerated code - Implement error handling and validation - Deploy to production with `flash deploy` diff --git a/01_getting_started/03_mixed_workers/README.md b/01_getting_started/03_mixed_workers/README.md index 78e8562..e85fad4 100644 --- a/01_getting_started/03_mixed_workers/README.md +++ b/01_getting_started/03_mixed_workers/README.md @@ -134,7 +134,7 @@ Total: $0.0019/sec name="preprocess_worker", cpu=CpuInstanceType.CPU3G_2_8, # 2 vCPU, 8GB workers=(0, 10), - idle_timeout=3, + idle_timeout=180, ) async def preprocess_text(input_data: dict) -> dict: ... ``` @@ -148,7 +148,7 @@ async def preprocess_text(input_data: dict) -> dict: ... name="inference_worker", gpu=GpuGroup.ADA_24, # RTX 4090 workers=(0, 3), - idle_timeout=5, + idle_timeout=300, dependencies=["torch"], ) async def gpu_inference(input_data: dict) -> dict: ... diff --git a/01_getting_started/04_dependencies/README.md b/01_getting_started/04_dependencies/README.md index 5ea27fe..6f7a6fd 100644 --- a/01_getting_started/04_dependencies/README.md +++ b/01_getting_started/04_dependencies/README.md @@ -6,7 +6,7 @@ Learn how to manage Python packages and system dependencies in Flash workers. - **Python dependencies** - Installing packages with version constraints - **System dependencies** - Installing apt packages (ffmpeg, libgl1, etc.) 
-- **Version pinning** - Reproducible builds with exact versions +- **Version constraints** - Supported syntax for version pinning - **Dependency optimization** - Minimizing cold start time ## Quick Start @@ -105,7 +105,7 @@ async def simple_function(data: dict) -> dict: ```python "requests==2.32.3" # Exactly 2.32.3 ``` -**Use when:** You need reproducible builds +**Use when:** You need reproducible builds for a specific Python version ### Minimum Version (>=) ```python @@ -117,7 +117,7 @@ async def simple_function(data: dict) -> dict: ```python "python-dateutil<3.0.0" # Below 3.0.0 ``` -**Use when:** Avoiding breaking changes +**Use when:** Avoiding breaking changes in a major release ### Compatible Release (~=) ```python @@ -129,7 +129,7 @@ async def simple_function(data: dict) -> dict: ```python "pandas" # Latest available ``` -**Use when:** You always want the newest version (not recommended for production) +**Use when:** You want the latest compatible version (recommended for examples and prototyping) ## Common Dependencies @@ -146,8 +146,8 @@ dependencies=[ ### Data Science ```python dependencies=[ - "pandas==2.1.3", - "numpy==1.26.2", + "pandas", + "numpy", "scipy>=1.11.0", "matplotlib", "scikit-learn", @@ -196,32 +196,28 @@ system_dependencies=["ffmpeg", "libgl1", "wget"] ## Best Practices -### 1. Pin Versions for Production +### 1. 
Use Version Constraints Thoughtfully ```python -# Good - Reproducible -@Endpoint( - name="worker", - gpu=GpuGroup.ADA_24, - dependencies=[ - "requests==2.32.3", - "transformers==4.35.2", - "numpy==1.26.2", - ], -) +# Good for examples and prototyping - works across Python versions +dependencies=[ + "requests", + "transformers", + "numpy", +] -# Bad - Unpredictable -@Endpoint( - name="worker", - gpu=GpuGroup.ADA_24, - dependencies=[ - "requests", # Version changes over time - "transformers", - "numpy", - ], -) +# Good for production - reproducible on a known Python version +dependencies=[ + "requests==2.32.3", + "transformers==4.35.2", + "numpy==1.26.2", +] ``` +Exact pins can break across Python versions (e.g., older numpy +builds don't exist for Python 3.13+). Pin only when you control +the target Python version. + ### 2. Minimize Dependencies ```python @@ -287,7 +283,9 @@ ERROR: Cannot install requests==2.25.0 and urllib3==2.2.1 because these package versions have conflicting dependencies. ``` -**Solution:** Check compatibility matrix, adjust versions: +**Solutions:** +1. Drop exact pins and let pip resolve compatible versions +2. Check compatibility matrix and adjust versions: ```python dependencies=[ "requests>=2.32.0", @@ -332,7 +330,7 @@ For local development, create `requirements.txt`: runpod-flash transformers==4.35.2 Pillow>=10.0.0 -numpy==1.26.2 +numpy ``` **Note:** Worker dependencies in the `Endpoint` decorator are deployed automatically. `requirements.txt` is for local development only. diff --git a/03_advanced_workers/05_load_balancer/README.md b/03_advanced_workers/05_load_balancer/README.md index f3071ac..2c6eadc 100644 --- a/03_advanced_workers/05_load_balancer/README.md +++ b/03_advanced_workers/05_load_balancer/README.md @@ -86,10 +86,10 @@ curl -X POST http://localhost:8888/05_load_balancer/cpu/transform \ Load-balanced endpoints use the `Endpoint` class with route decorators (`.get()`, `.post()`, etc.) to define HTTP routes. 
The decorator automatically registers the function as an HTTP endpoint on the load-balancer runtime. ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType # create load-balanced endpoint -api = Endpoint(name="my-service", gpu=GpuGroup.ANY, workers=(1, 3)) +api = Endpoint(name="my-service", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 3)) # define HTTP routes with method decorators @api.get("/health") diff --git a/03_advanced_workers/05_load_balancer/gpu_lb.py b/03_advanced_workers/05_load_balancer/gpu_lb.py index a17301a..574b029 100644 --- a/03_advanced_workers/05_load_balancer/gpu_lb.py +++ b/03_advanced_workers/05_load_balancer/gpu_lb.py @@ -1,9 +1,13 @@ # gpu load-balanced endpoints with custom HTTP routes. # run with: flash run # test directly: python gpu_lb.py -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType -api = Endpoint(name="03_05_load_balancer_gpu", gpu=GpuGroup.ANY, workers=(1, 3)) +api = Endpoint( + name="03_05_load_balancer_gpu", + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, + workers=(1, 3), +) @api.get("/health") diff --git a/04_scaling_performance/01_autoscaling/README.md b/04_scaling_performance/01_autoscaling/README.md index 2d8101a..3c1a09d 100644 --- a/04_scaling_performance/01_autoscaling/README.md +++ b/04_scaling_performance/01_autoscaling/README.md @@ -86,20 +86,20 @@ Requests arrive | `idle_timeout` | int | 60 | Minutes before idle workers terminate | | `scaler_type` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric | | `scaler_value` | int | 4 | Target value for the scaler metric | -| `gpu` | GpuGroup or GpuType | ANY | GPU type for GPU endpoints | +| `gpu` | GpuType or GpuGroup | -- | GPU type for GPU endpoints | | `cpu` | CpuInstanceType or str | -- | CPU instance type for CPU endpoints | ### Example Configurations ```python -from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType +from runpod_flash import Endpoint, GpuType, 
ServerlessScalerType # scale to zero, cost-optimized @Endpoint( name="batch-worker", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 3), - idle_timeout=5, + idle_timeout=1, scaler_type=ServerlessScalerType.QUEUE_DELAY, scaler_value=4, ) @@ -108,7 +108,7 @@ async def batch_process(payload: dict) -> dict: ... # always-on, latency-optimized @Endpoint( name="api-worker", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 3), idle_timeout=60, ) @@ -117,7 +117,7 @@ async def api_process(payload: dict) -> dict: ... # high-throughput, burst-optimized @Endpoint( name="burst-worker", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(2, 10), idle_timeout=30, scaler_type=ServerlessScalerType.REQUEST_COUNT, diff --git a/04_scaling_performance/01_autoscaling/gpu_worker.py b/04_scaling_performance/01_autoscaling/gpu_worker.py index ca982a2..9af139c 100644 --- a/04_scaling_performance/01_autoscaling/gpu_worker.py +++ b/04_scaling_performance/01_autoscaling/gpu_worker.py @@ -1,7 +1,7 @@ # gpu autoscaling strategies -- scale-to-zero, always-on, high-throughput. # run with: flash run # test directly: python gpu_worker.py -from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType +from runpod_flash import Endpoint, GpuType, ServerlessScalerType # --- strategy 1: scale to zero --- @@ -9,9 +9,9 @@ # workers scale down to zero after 5 minutes of idle time. @Endpoint( name="04_01_scale_to_zero", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 3), - idle_timeout=5, + idle_timeout=300, scaler_type=ServerlessScalerType.QUEUE_DELAY, scaler_value=4, ) @@ -55,7 +55,7 @@ async def scale_to_zero_inference(payload: dict) -> dict: # at least one worker stays warm to avoid cold starts. 
@Endpoint( name="04_01_always_on", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 3), idle_timeout=60, scaler_type=ServerlessScalerType.QUEUE_DELAY, @@ -101,7 +101,7 @@ async def always_on_inference(payload: dict) -> dict: # starts with 2 warm workers, scales aggressively to 10 based on request count. @Endpoint( name="04_01_high_throughput", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(2, 10), idle_timeout=30, scaler_type=ServerlessScalerType.REQUEST_COUNT, diff --git a/05_data_workflows/01_network_volumes/gpu_worker.py b/05_data_workflows/01_network_volumes/gpu_worker.py index f9ac69e..da47596 100644 --- a/05_data_workflows/01_network_volumes/gpu_worker.py +++ b/05_data_workflows/01_network_volumes/gpu_worker.py @@ -3,7 +3,7 @@ # test directly: python gpu_worker.py import logging -from runpod_flash import Endpoint, GpuGroup, NetworkVolume +from runpod_flash import Endpoint, GpuType, NetworkVolume logger = logging.getLogger(__name__) @@ -17,12 +17,12 @@ @Endpoint( name="05_01_gpu_worker", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_5090, workers=(0, 3), - idle_timeout=5, + idle_timeout=300, volume=volume, env={"HF_HUB_CACHE": MODEL_PATH, "MODEL_PATH": MODEL_PATH}, - dependencies=["diffusers", "transformers"], + dependencies=["torch", "diffusers", "transformers", "accelerate"], ) class SimpleSD: def __init__(self): @@ -58,6 +58,7 @@ def __init__(self): f"Model weights stored in {model_path}: {os.listdir(model_path)}" ) + async def generate_image(self, prompt: str) -> dict: """Generate a single image from prompt.""" self.logger.info(f"Generating image for: '{prompt}'") diff --git a/CLAUDE.md b/CLAUDE.md index a9a348c..b3e33be 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -44,13 +44,11 @@ All worker files across 6 categories. 
Each file is an independent entry point di **Queue-based (function decorator):** ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType @Endpoint( name="my-worker", - gpu=GpuGroup.ANY, - workers=(0, 3), - idle_timeout=5, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, ) async def my_function(payload: dict) -> dict: """All runtime imports inside the function body.""" @@ -79,9 +77,9 @@ GPU vs CPU is a parameter, not a class choice: | Config | Syntax | Use Case | |--------|--------|----------| -| GPU endpoint | `@Endpoint(name=..., gpu=GpuGroup.ANY)` | GPU workers | +| GPU endpoint | `@Endpoint(name=..., gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)` | GPU workers | | CPU endpoint | `@Endpoint(name=..., cpu="cpu3c-1-2")` | CPU workers | -| GPU LB | `api = Endpoint(name=..., gpu=GpuGroup.ANY); @api.post(...)` | GPU LB endpoints | +| GPU LB | `api = Endpoint(name=..., gpu=GpuType.NVIDIA_GEFORCE_RTX_4090); @api.post(...)` | GPU LB endpoints | | CPU LB | `api = Endpoint(name=..., cpu="cpu3c-1-2"); @api.post(...)` | CPU LB endpoints | ### Cross-Worker Orchestration @@ -108,7 +106,7 @@ All examples import from `runpod_flash`. 
Import frequency by symbol: | Symbol | Files Using It | Breakage Risk | |--------|---------------|---------------| | `Endpoint` | 18 | ALL examples break | -| `GpuGroup` | 7 | GPU config breaks | +| `GpuType` | 7 | GPU config breaks | | `CpuInstanceType` | 4 | CPU config breaks | | `NetworkVolume` | 2 | Volume examples break | | `ServerlessScalerType` | 1 | Scaling example breaks | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f42f66b..35181ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -182,9 +182,9 @@ your_example/ ### Minimal Worker (`gpu_worker.py`) ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType -@Endpoint(name="your-worker", gpu=GpuGroup.ANY, dependencies=["torch"]) +@Endpoint(name="your-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, dependencies=["torch"]) async def your_function(payload: dict) -> dict: """ Clear docstring explaining what this function does. @@ -406,10 +406,10 @@ RUNPOD_API_KEY = "hardcoded_key" # Never do this! ```python # Good - handle errors within @Endpoint functions -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType # Good -@Endpoint(name="processor", gpu=GpuGroup.ANY) +@Endpoint(name="processor", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) async def process(data: dict) -> dict: try: result = do_work(data) @@ -418,7 +418,7 @@ async def process(data: dict) -> dict: return {"status": "error", "detail": str(e)} # Bad -@Endpoint(name="processor", gpu=GpuGroup.ANY) +@Endpoint(name="processor", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) async def process(data: dict) -> dict: result = do_work(data) # no error handling return result diff --git a/docs/cli/troubleshooting.md b/docs/cli/troubleshooting.md index 9f0cc58..54612b0 100644 --- a/docs/cli/troubleshooting.md +++ b/docs/cli/troubleshooting.md @@ -789,13 +789,13 @@ ERROR: Failed to create endpoint: Insufficient GPU availability **Solutions:** -**1. Use more flexible GPU type:** +**1. 
Switch to a commonly-available GPU type:** ```python # before (specific GPU) -@Endpoint(name="worker", gpu=GpuGroup.A100) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB) -# after (any GPU) -@Endpoint(name="worker", gpu=GpuGroup.ANY) +# after (widely available) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) ``` Redeploy: @@ -803,12 +803,12 @@ Redeploy: flash deploy --env production ``` -**2. Try different GPU type:** +**2. Use GpuGroup for maximum flexibility:** ```python -# More common/available GPUs -gpus=[GpuGroup.RTX_4090] -# or -gpus=[GpuGroup.RTX_3090] +# Accepts any GPU in the group +gpu=GpuGroup.ADA_24 +# or any available GPU at all +gpu=GpuGroup.ANY ``` **3. Wait and retry:** @@ -926,7 +926,7 @@ RuntimeError: CUDA not available Fix: ```python # ensure GPU specified in Endpoint -@Endpoint(name="worker", gpu=GpuGroup.ANY) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) ``` **4. Redeploy after fixing:** @@ -1409,7 +1409,7 @@ flash deploy --env production | Environment not found | `flash env create ` | | Module not found | `pip install -e .` | | Upload failed | Retry or reduce size | -| GPU unavailable | Use `gpus=[GpuGroup.ANY]` | +| GPU unavailable | Use `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` | **Diagnostic Commands:** diff --git a/docs/cli/workflows.md b/docs/cli/workflows.md index 3283969..f0ea3b2 100644 --- a/docs/cli/workflows.md +++ b/docs/cli/workflows.md @@ -101,7 +101,7 @@ INFO: Application startup complete. 
Edit your worker files (e.g., `gpu_worker.py`): ```python -@Endpoint(name="my-worker", gpu=GpuGroup.ANY) +@Endpoint(name="my-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) async def process_request(payload: dict) -> dict: """Process incoming requests on GPU.""" result = perform_processing(payload) @@ -601,7 +601,7 @@ flash deploy --env production ```python @Endpoint( name="myapi_dev_gpu", - gpu=GpuGroup.ANY, # any GPU is fine + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, # specific GPU for fast provisioning workers=(0, 2), # scale to zero, small max for cost idle_timeout=1, # quick shutdown ) @@ -614,7 +614,7 @@ async def process(payload: dict) -> dict: ... name="myapi_prod_gpu", gpu=GpuGroup.A100, # specific GPU for consistency workers=(1, 10), # always have one ready, handle load spikes - idle_timeout=5, # keep warm longer + idle_timeout=300, # keep warm longer ) async def process(payload: dict) -> dict: ... ``` @@ -1155,12 +1155,12 @@ flash env get production **Before (high cost):** ```python -@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(5, 10)) # always 5 running +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(5, 10)) # always 5 running ``` **After (optimized):** ```python -@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(0, 10), idle_timeout=1) # scale to zero +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 10), idle_timeout=1) # scale to zero ``` Redeploy with optimized config: @@ -1448,13 +1448,13 @@ cat gpu_worker.py **Solutions:** -**A. Change GPU type:** +**A. Choose a commonly-available GPU type:** ```python # before (specific GPU, may not be available) -@Endpoint(name="worker", gpu=GpuGroup.A100) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB) -# after (more flexible) -@Endpoint(name="worker", gpu=GpuGroup.ANY) +# after (widely available) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) ``` Redeploy: @@ -1469,9 +1469,9 @@ sleep 300 flash deploy --env production ``` -**C. 
Choose different GPU type:** +**C. Use GpuGroup for maximum flexibility:** ```python -@Endpoint(name="worker", gpu=GpuGroup.RTX_4090) # more common +@Endpoint(name="worker", gpu=GpuGroup.ANY) # any available GPU ``` #### Issue 5: Runtime Errors @@ -1534,7 +1534,7 @@ RuntimeError: CUDA not available Solution: Verify GPU configuration: ```python -@Endpoint(name="worker", gpu=GpuGroup.ANY) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) ``` #### Issue 6: Performance Issues @@ -1560,9 +1560,9 @@ flash env get production # keep workers warm with workers=(1, N) @Endpoint( name="worker", - gpu=GpuGroup.ANY, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 5), # keep 1 warm - idle_timeout=5, # keep alive longer + idle_timeout=600, # keep alive longer ) ``` @@ -1575,7 +1575,7 @@ flash env get production # lazy loading example _model = None -@Endpoint(name="worker", gpu=GpuGroup.ANY) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) async def infer(payload: dict) -> dict: global _model if _model is None: @@ -1586,7 +1586,7 @@ async def infer(payload: dict) -> dict: **C. 
Increase worker capacity:** ```python # handle more concurrent requests -@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(0, 10)) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 10)) ``` ### General Debugging Approach From 2f029529e1223b1b3afcc6854aba40c0db66182a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 8 Mar 2026 09:48:01 -0700 Subject: [PATCH 3/7] fix: correct idle_timeout values and dependency pins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - idle_timeout is seconds, not minutes — update values accordingly - Remove unnecessary workers/idle_timeout defaults from simple examples - Unpin pandas/numpy versions for cross-Python-version compatibility - Add missing accelerate dependency to SD example - Add explicit workers to LB endpoints that need them --- 01_getting_started/01_hello_world/gpu_worker.py | 4 +--- 01_getting_started/02_cpu_worker/cpu_worker.py | 2 -- 01_getting_started/03_mixed_workers/gpu_worker.py | 1 - 01_getting_started/04_dependencies/cpu_worker.py | 4 ++-- 02_ml_inference/01_text_to_speech/gpu_worker.py | 2 +- 03_advanced_workers/05_load_balancer/cpu_lb.py | 6 +++++- 04_scaling_performance/01_autoscaling/cpu_worker.py | 2 +- 05_data_workflows/01_network_volumes/cpu_worker.py | 1 + 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/01_getting_started/01_hello_world/gpu_worker.py b/01_getting_started/01_hello_world/gpu_worker.py index 5c7c093..d204c16 100644 --- a/01_getting_started/01_hello_world/gpu_worker.py +++ b/01_getting_started/01_hello_world/gpu_worker.py @@ -6,9 +6,7 @@ @Endpoint( name="01_01_gpu_worker", - gpu=GpuGroup.ANY, - workers=(0, 3), - idle_timeout=5, + gpu=GpuGroup.ADA_24, ) async def gpu_hello(input_data: dict) -> dict: """GPU worker that returns GPU hardware info.""" diff --git a/01_getting_started/02_cpu_worker/cpu_worker.py b/01_getting_started/02_cpu_worker/cpu_worker.py index 01fdef7..0679296 100644 
--- a/01_getting_started/02_cpu_worker/cpu_worker.py +++ b/01_getting_started/02_cpu_worker/cpu_worker.py @@ -7,8 +7,6 @@ @Endpoint( name="01_02_cpu_worker", cpu=CpuInstanceType.CPU3C_1_2, - workers=(0, 3), - idle_timeout=5, ) async def cpu_hello(input_data: dict) -> dict: """CPU worker that returns a greeting.""" diff --git a/01_getting_started/03_mixed_workers/gpu_worker.py b/01_getting_started/03_mixed_workers/gpu_worker.py index 64ab66c..b6ae065 100644 --- a/01_getting_started/03_mixed_workers/gpu_worker.py +++ b/01_getting_started/03_mixed_workers/gpu_worker.py @@ -9,7 +9,6 @@ name="01_03_mixed_inference", gpu=GpuGroup.ADA_24, workers=(0, 3), - idle_timeout=5, ) async def gpu_inference(input_data: dict) -> dict: """GPU inference: mock sentiment classification.""" diff --git a/01_getting_started/04_dependencies/cpu_worker.py b/01_getting_started/04_dependencies/cpu_worker.py index 05f994c..64e2c96 100644 --- a/01_getting_started/04_dependencies/cpu_worker.py +++ b/01_getting_started/04_dependencies/cpu_worker.py @@ -10,8 +10,8 @@ cpu=CpuInstanceType.CPU3C_8_16, workers=(0, 3), dependencies=[ - "pandas==2.1.3", - "numpy==1.26.2", + "pandas", + "numpy", "scipy>=1.11.0", "matplotlib", ], diff --git a/02_ml_inference/01_text_to_speech/gpu_worker.py b/02_ml_inference/01_text_to_speech/gpu_worker.py index a9e3cd1..6d60e01 100644 --- a/02_ml_inference/01_text_to_speech/gpu_worker.py +++ b/02_ml_inference/01_text_to_speech/gpu_worker.py @@ -8,7 +8,7 @@ name="02_01_text_to_speech_gpu", gpu=GpuGroup.ADA_24, workers=(0, 3), - idle_timeout=5, + idle_timeout=300, dependencies=["qwen-tts", "soundfile"], ) async def generate_speech(input_data: dict) -> dict: diff --git a/03_advanced_workers/05_load_balancer/cpu_lb.py b/03_advanced_workers/05_load_balancer/cpu_lb.py index 47f1d95..08a9105 100644 --- a/03_advanced_workers/05_load_balancer/cpu_lb.py +++ b/03_advanced_workers/05_load_balancer/cpu_lb.py @@ -3,7 +3,11 @@ # test directly: python cpu_lb.py from runpod_flash import 
Endpoint -api = Endpoint(name="03_05_load_balancer_cpu", cpu="cpu3c-1-2") +api = Endpoint( + name="03_05_load_balancer_cpu", + cpu="cpu3c-1-2", + workers=(1, 3), +) @api.get("/health") diff --git a/04_scaling_performance/01_autoscaling/cpu_worker.py b/04_scaling_performance/01_autoscaling/cpu_worker.py index a508158..fa4f206 100644 --- a/04_scaling_performance/01_autoscaling/cpu_worker.py +++ b/04_scaling_performance/01_autoscaling/cpu_worker.py @@ -10,7 +10,7 @@ name="04_01_cpu_scale_to_zero", cpu=CpuInstanceType.CPU3C_1_2, workers=(0, 5), - idle_timeout=5, + idle_timeout=1, ) async def cpu_scale_to_zero(payload: dict) -> dict: """CPU worker with scale-to-zero -- cost-optimized preprocessing.""" diff --git a/05_data_workflows/01_network_volumes/cpu_worker.py b/05_data_workflows/01_network_volumes/cpu_worker.py index d2c02cf..5d1dad4 100644 --- a/05_data_workflows/01_network_volumes/cpu_worker.py +++ b/05_data_workflows/01_network_volumes/cpu_worker.py @@ -12,6 +12,7 @@ name="05_01_cpu_worker", cpu="cpu3c-1-2", workers=(1, 3), + idle_timeout=120, volume=volume, ) From f7593ca2d39f4beae6c2c3d163fd779c576354a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 8 Mar 2026 09:51:02 -0700 Subject: [PATCH 4/7] chore: format gpu_worker.py --- 05_data_workflows/01_network_volumes/gpu_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/05_data_workflows/01_network_volumes/gpu_worker.py b/05_data_workflows/01_network_volumes/gpu_worker.py index da47596..fd4c7b2 100644 --- a/05_data_workflows/01_network_volumes/gpu_worker.py +++ b/05_data_workflows/01_network_volumes/gpu_worker.py @@ -58,7 +58,6 @@ def __init__(self): f"Model weights stored in {model_path}: {os.listdir(model_path)}" ) - async def generate_image(self, prompt: str) -> dict: """Generate a single image from prompt.""" self.logger.info(f"Generating image for: '{prompt}'") From 5a42db41847bfdc6f87578c7416c3a863e64d6ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= 
Date: Sun, 8 Mar 2026 21:43:42 -0700 Subject: [PATCH 5/7] chore: add project metadata and tighten Python upper bound to <3.13 --- pyproject.toml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c54284a..ad95415 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,18 @@ name = "runpod-flash-examples" version = "1.0.0" description = "A collection of example applications showcasing Runpod Flash - a framework for building production-ready AI applications with distributed GPU and CPU computing." +authors = [ + { name = "Runpod", email = "engineer@runpod.io" }, +] +license = { text = "MIT" } +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] readme = "README.md" -requires-python = ">=3.10,<3.14" +requires-python = ">=3.10,<3.13" dependencies = [ "runpod-flash>=1.7.0", ] From af4b3675b047c50424432ee3a00bb53aa9363abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Mar 2026 08:42:46 -0700 Subject: [PATCH 6/7] fix(review): address PR #41 feedback -- GpuType naming, idle_timeout, examples - Fix GpuType.NVIDIA_A100_80GB to NVIDIA_A100_80GB_PCIe across docs - Fix GpuType.NVIDIA_H100_80GB to NVIDIA_H100_80GB_HBM3 in README - Migrate hello_world gpu_worker from GpuGroup.ADA_24 to GpuType.NVIDIA_GEFORCE_RTX_4090 - Fix idle_timeout config dict mismatches in autoscaling workers - Correct idle_timeout units from minutes to seconds in README tables - Use GpuGroup.ANY for GPU unavailable troubleshooting quick fix --- 01_getting_started/01_hello_world/README.md | 2 +- 01_getting_started/01_hello_world/gpu_worker.py | 4 ++-- 04_scaling_performance/01_autoscaling/README.md | 14 +++++++------- .../01_autoscaling/cpu_worker.py | 2 +- .../01_autoscaling/gpu_worker.py | 2 +- docs/cli/troubleshooting.md | 4 ++-- docs/cli/workflows.md | 2 +- 7 files changed, 15 
insertions(+), 15 deletions(-) diff --git a/01_getting_started/01_hello_world/README.md b/01_getting_started/01_hello_world/README.md index 91181b9..beb291c 100644 --- a/01_getting_started/01_hello_world/README.md +++ b/01_getting_started/01_hello_world/README.md @@ -145,7 +145,7 @@ flash run ## Next Steps -- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. `GpuType.NVIDIA_A100_80GB`, `GpuType.NVIDIA_H100_80GB`) +- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. `GpuType.NVIDIA_A100_80GB_PCIe`, `GpuType.NVIDIA_H100_80GB_HBM3`) - Add your own GPU-accelerated code - Implement error handling and validation - Deploy to production with `flash deploy` diff --git a/01_getting_started/01_hello_world/gpu_worker.py b/01_getting_started/01_hello_world/gpu_worker.py index d204c16..d7a330f 100644 --- a/01_getting_started/01_hello_world/gpu_worker.py +++ b/01_getting_started/01_hello_world/gpu_worker.py @@ -1,12 +1,12 @@ # gpu serverless worker -- detects available GPU hardware. 
# run with: flash run # test directly: python gpu_worker.py -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType @Endpoint( name="01_01_gpu_worker", - gpu=GpuGroup.ADA_24, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, ) async def gpu_hello(input_data: dict) -> dict: """GPU worker that returns GPU hardware info.""" diff --git a/04_scaling_performance/01_autoscaling/README.md b/04_scaling_performance/01_autoscaling/README.md index 3c1a09d..0e02e67 100644 --- a/04_scaling_performance/01_autoscaling/README.md +++ b/04_scaling_performance/01_autoscaling/README.md @@ -38,16 +38,16 @@ curl -X POST http://localhost:8888/cpu_worker/runsync \ | Strategy | workers | idle_timeout | scaler_type | scaler_value | Use Case | |----------|---------|-------------|-------------|-------------|----------| -| Scale to Zero | (0, 3) | 5 min | QUEUE_DELAY | 4 | Sporadic/batch, cost-first | -| Always On | (1, 3) | 60 min | QUEUE_DELAY | 4 | Steady traffic, latency-first | -| High Throughput | (2, 10) | 30 min | REQUEST_COUNT | 3 | Bursty traffic, throughput-first | +| Scale to Zero | (0, 3) | 300s | QUEUE_DELAY | 4 | Sporadic/batch, cost-first | +| Always On | (1, 3) | 60s | QUEUE_DELAY | 4 | Steady traffic, latency-first | +| High Throughput | (2, 10) | 30s | REQUEST_COUNT | 3 | Bursty traffic, throughput-first | ### CPU Workers (`cpu_worker.py`) | Strategy | workers | idle_timeout | Use Case | |----------|---------|-------------|----------| -| Scale to Zero | (0, 5) | 5 min | Cost-optimized preprocessing | -| Burst Ready | (1, 10) | 30 min | Always-warm API gateway | +| Scale to Zero | (0, 5) | 1s | Cost-optimized preprocessing | +| Burst Ready | (1, 10) | 30s | Always-warm API gateway | ## How Autoscaling Works @@ -83,7 +83,7 @@ Requests arrive | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `workers` | int or (min, max) | (0, 1) | Worker scaling bounds | -| `idle_timeout` | int | 60 | Minutes before idle workers 
terminate | +| `idle_timeout` | int | 60 | Seconds before idle workers terminate | | `scaler_type` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric | | `scaler_value` | int | 4 | Target value for the scaler metric | | `gpu` | GpuType or GpuGroup | -- | GPU type for GPU endpoints | @@ -99,7 +99,7 @@ from runpod_flash import Endpoint, GpuType, ServerlessScalerType name="batch-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 3), - idle_timeout=1, + idle_timeout=300, scaler_type=ServerlessScalerType.QUEUE_DELAY, scaler_value=4, ) diff --git a/04_scaling_performance/01_autoscaling/cpu_worker.py b/04_scaling_performance/01_autoscaling/cpu_worker.py index fa4f206..6660ea3 100644 --- a/04_scaling_performance/01_autoscaling/cpu_worker.py +++ b/04_scaling_performance/01_autoscaling/cpu_worker.py @@ -39,7 +39,7 @@ async def cpu_scale_to_zero(payload: dict) -> dict: "token_count": len(tokens), "byte_size": byte_size, }, - "config": {"workersMin": 0, "workersMax": 5, "idleTimeout": 5}, + "config": {"workersMin": 0, "workersMax": 5, "idleTimeout": 1}, "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"), } diff --git a/04_scaling_performance/01_autoscaling/gpu_worker.py b/04_scaling_performance/01_autoscaling/gpu_worker.py index 9af139c..2d12fb0 100644 --- a/04_scaling_performance/01_autoscaling/gpu_worker.py +++ b/04_scaling_performance/01_autoscaling/gpu_worker.py @@ -45,7 +45,7 @@ async def scale_to_zero_inference(payload: dict) -> dict: if torch.cuda.is_available() else "N/A", }, - "config": {"workersMin": 0, "workersMax": 3, "idleTimeout": 5}, + "config": {"workersMin": 0, "workersMax": 3, "idleTimeout": 300}, "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"), } diff --git a/docs/cli/troubleshooting.md b/docs/cli/troubleshooting.md index 54612b0..9d689a6 100644 --- a/docs/cli/troubleshooting.md +++ b/docs/cli/troubleshooting.md @@ -792,7 +792,7 @@ ERROR: Failed to create endpoint: Insufficient GPU availability **1. 
Switch to a commonly-available GPU type:** ```python # before (specific GPU) -@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB_PCIe) # after (widely available) @Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) @@ -1409,7 +1409,7 @@ flash deploy --env production | Environment not found | `flash env create <name>` | | Module not found | `pip install -e .` | | Upload failed | Retry or reduce size | -| GPU unavailable | Use `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` | +| GPU unavailable | Use `gpu=GpuGroup.ANY` or `gpu=GpuType.ANY` | **Diagnostic Commands:** diff --git a/docs/cli/workflows.md b/docs/cli/workflows.md index f0ea3b2..78bc718 100644 --- a/docs/cli/workflows.md +++ b/docs/cli/workflows.md @@ -1451,7 +1451,7 @@ cat gpu_worker.py **A. Choose a commonly-available GPU type:** ```python # before (specific GPU, may not be available) -@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB) +@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB_PCIe) # after (widely available) @Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090) From e4ec7e9ba5baef79b3ed78bd9e94aa03cf30a250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Mar 2026 17:20:30 -0700 Subject: [PATCH 7/7] chore: bump runpod-flash minimum version to 1.8.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ad95415..3be2308 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ classifiers = [ readme = "README.md" requires-python = ">=3.10,<3.13" dependencies = [ - "runpod-flash>=1.7.0", + "runpod-flash>=1.8.0", ] [dependency-groups]