From cfa1806a7a6c3c8c4367bc94e605f88099f65ffd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 8 Mar 2026 09:47:49 -0700
Subject: [PATCH 1/7] docs: document Python version constraints for deployed
 workers

GPU workers are pinned to Python 3.12 (torch and CUDA libraries are
installed only for 3.12 in the base image). CPU workers support
3.10-3.12. The build pipeline handles wheel selection automatically.

Also caps requires-python at <3.14 and raises the minimum runpod-flash
version to 1.7.0.
---
 README.md      | 9 ++++++++-
 pyproject.toml | 4 ++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6fc04fd..c73bcdf 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,13 @@ async def generate_image(prompt: str) -> bytes:
 - **uv**: Install with `curl -LsSf https://astral.sh/uv/install.sh | sh`
 - **Runpod account**: [Sign up here](https://runpod.io/console/signup)
 
+### Python version in deployed workers
+
+Your local Python version does not affect what runs in the cloud. `flash build` downloads wheels for the container's Python version automatically.
+
+- **GPU workers**: Python 3.12 only. The GPU base image ships multiple interpreters (3.9-3.14) for interactive pod use, but torch and CUDA libraries are installed only for 3.12.
+- **CPU workers**: Python 3.10, 3.11, or 3.12. Configurable via `PYTHON_VERSION` build arg.
+
 ## Quick Start
 
 ```bash
@@ -139,7 +146,7 @@ print(job.output)
 Workers automatically scale based on demand:
 - `workers=(0, 3)` - Scale from 0 to 3 workers (cost-efficient)
 - `workers=(1, 5)` - Keep 1 warm, scale up to 5
-- `idle_timeout=5` - Minutes before scaling down
+- `idle_timeout=5` - Seconds before scaling down
 
 ## Resources
 
diff --git a/pyproject.toml b/pyproject.toml
index 83b1b42..c54284a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,9 +3,9 @@ name = "runpod-flash-examples"
 version = "1.0.0"
 description = "A collection of example applications showcasing Runpod Flash - a framework for building production-ready AI applications with distributed GPU and CPU computing."
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.10,<3.14"
 dependencies = [
-    "runpod-flash>=1.4.0",
+    "runpod-flash>=1.7.0",
 ]
 
 [dependency-groups]

From dd03f4a95b8eb89639ac8c0c99d55ae17a4315b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 8 Mar 2026 09:47:55 -0700
Subject: [PATCH 2/7] refactor: migrate GpuGroup.ANY to GpuType with specific
 GPU models

Replace GpuGroup.ANY with explicit GpuType values (e.g.
GpuType.NVIDIA_GEFORCE_RTX_4090) across all examples, docs, and the
contributing guide. Specific GPU types give users faster provisioning
and predictable behavior.
---
 01_getting_started/01_hello_world/README.md   |  6 +-
 01_getting_started/03_mixed_workers/README.md |  4 +-
 01_getting_started/04_dependencies/README.md  | 56 +++++++++----------
 .../05_load_balancer/README.md                |  4 +-
 .../05_load_balancer/gpu_lb.py                |  8 ++-
 .../01_autoscaling/README.md                  | 12 ++--
 .../01_autoscaling/gpu_worker.py              | 10 ++--
 .../01_network_volumes/gpu_worker.py          |  9 +--
 CLAUDE.md                                     | 12 ++--
 CONTRIBUTING.md                               | 10 ++--
 docs/cli/troubleshooting.md                   | 22 ++++----
 docs/cli/workflows.md                         | 32 +++++------
 12 files changed, 93 insertions(+), 92 deletions(-)

diff --git a/01_getting_started/01_hello_world/README.md b/01_getting_started/01_hello_world/README.md
index a72e3b0..91181b9 100644
--- a/01_getting_started/01_hello_world/README.md
+++ b/01_getting_started/01_hello_world/README.md
@@ -112,9 +112,9 @@ The `@Endpoint` decorator transparently executes functions on serverless infrast
 - Handles serialization and resource management
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
-@Endpoint(name="my-worker", gpu=GpuGroup.ANY, workers=(0, 3))
+@Endpoint(name="my-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 3))
 async def my_function(data: dict) -> dict:
     return {"result": "processed"}
 ```
@@ -145,7 +145,7 @@ flash run
 
 ## Next Steps
 
-- Customize GPU type: Change `GpuGroup.ANY` to a specific GPU (e.g. `GpuGroup.ADA_24`, `GpuGroup.AMPERE_80`)
+- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. `GpuType.NVIDIA_A100_80GB`, `GpuType.NVIDIA_H100_80GB`)
 - Add your own GPU-accelerated code
 - Implement error handling and validation
 - Deploy to production with `flash deploy`
diff --git a/01_getting_started/03_mixed_workers/README.md b/01_getting_started/03_mixed_workers/README.md
index 78e8562..e85fad4 100644
--- a/01_getting_started/03_mixed_workers/README.md
+++ b/01_getting_started/03_mixed_workers/README.md
@@ -134,7 +134,7 @@ Total: $0.0019/sec
     name="preprocess_worker",
     cpu=CpuInstanceType.CPU3G_2_8,  # 2 vCPU, 8GB
     workers=(0, 10),
-    idle_timeout=3,
+    idle_timeout=180,
 )
 async def preprocess_text(input_data: dict) -> dict: ...
 ```
@@ -148,7 +148,7 @@ async def preprocess_text(input_data: dict) -> dict: ...
     name="inference_worker",
     gpu=GpuGroup.ADA_24,  # RTX 4090
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     dependencies=["torch"],
 )
 async def gpu_inference(input_data: dict) -> dict: ...
diff --git a/01_getting_started/04_dependencies/README.md b/01_getting_started/04_dependencies/README.md
index 5ea27fe..6f7a6fd 100644
--- a/01_getting_started/04_dependencies/README.md
+++ b/01_getting_started/04_dependencies/README.md
@@ -6,7 +6,7 @@ Learn how to manage Python packages and system dependencies in Flash workers.
 
 - **Python dependencies** - Installing packages with version constraints
 - **System dependencies** - Installing apt packages (ffmpeg, libgl1, etc.)
-- **Version pinning** - Reproducible builds with exact versions
+- **Version constraints** - Supported syntax for version pinning
 - **Dependency optimization** - Minimizing cold start time
 
 ## Quick Start
@@ -105,7 +105,7 @@ async def simple_function(data: dict) -> dict:
 ```python
 "requests==2.32.3"  # Exactly 2.32.3
 ```
-**Use when:** You need reproducible builds
+**Use when:** You need reproducible builds for a specific Python version
 
 ### Minimum Version (>=)
 ```python
@@ -117,7 +117,7 @@ async def simple_function(data: dict) -> dict:
 ```python
 "python-dateutil<3.0.0"  # Below 3.0.0
 ```
-**Use when:** Avoiding breaking changes
+**Use when:** Avoiding breaking changes in a major release
 
 ### Compatible Release (~=)
 ```python
@@ -129,7 +129,7 @@ async def simple_function(data: dict) -> dict:
 ```python
 "pandas"  # Latest available
 ```
-**Use when:** You always want the newest version (not recommended for production)
+**Use when:** You want the latest compatible version (recommended for examples and prototyping)
 
 ## Common Dependencies
 
@@ -146,8 +146,8 @@ dependencies=[
 ### Data Science
 ```python
 dependencies=[
-    "pandas==2.1.3",
-    "numpy==1.26.2",
+    "pandas",
+    "numpy",
     "scipy>=1.11.0",
     "matplotlib",
     "scikit-learn",
@@ -196,32 +196,28 @@ system_dependencies=["ffmpeg", "libgl1", "wget"]
 
 ## Best Practices
 
-### 1. Pin Versions for Production
+### 1. Use Version Constraints Thoughtfully
 
 ```python
-# Good - Reproducible
-@Endpoint(
-    name="worker",
-    gpu=GpuGroup.ADA_24,
-    dependencies=[
-        "requests==2.32.3",
-        "transformers==4.35.2",
-        "numpy==1.26.2",
-    ],
-)
+# Good for examples and prototyping - works across Python versions
+dependencies=[
+    "requests",
+    "transformers",
+    "numpy",
+]
 
-# Bad - Unpredictable
-@Endpoint(
-    name="worker",
-    gpu=GpuGroup.ADA_24,
-    dependencies=[
-        "requests",  # Version changes over time
-        "transformers",
-        "numpy",
-    ],
-)
+# Good for production - reproducible on a known Python version
+dependencies=[
+    "requests==2.32.3",
+    "transformers==4.35.2",
+    "numpy==1.26.2",
+]
 ```
 
+Exact pins can break across Python versions (e.g., older numpy
+builds don't exist for Python 3.13+). Pin only when you control
+the target Python version.
+
 ### 2. Minimize Dependencies
 
 ```python
@@ -287,7 +283,9 @@ ERROR: Cannot install requests==2.25.0 and urllib3==2.2.1
 because these package versions have conflicting dependencies.
 ```
 
-**Solution:** Check compatibility matrix, adjust versions:
+**Solutions:**
+1. Drop exact pins and let pip resolve compatible versions
+2. Check compatibility matrix and adjust versions:
 ```python
 dependencies=[
     "requests>=2.32.0",
@@ -332,7 +330,7 @@ For local development, create `requirements.txt`:
 runpod-flash
 transformers==4.35.2
 Pillow>=10.0.0
-numpy==1.26.2
+numpy
 ```
 
 **Note:** Worker dependencies in the `Endpoint` decorator are deployed automatically. `requirements.txt` is for local development only.
diff --git a/03_advanced_workers/05_load_balancer/README.md b/03_advanced_workers/05_load_balancer/README.md
index f3071ac..2c6eadc 100644
--- a/03_advanced_workers/05_load_balancer/README.md
+++ b/03_advanced_workers/05_load_balancer/README.md
@@ -86,10 +86,10 @@ curl -X POST http://localhost:8888/05_load_balancer/cpu/transform \
 Load-balanced endpoints use the `Endpoint` class with route decorators (`.get()`, `.post()`, etc.) to define HTTP routes. The decorator automatically registers the function as an HTTP endpoint on the load-balancer runtime.
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
 # create load-balanced endpoint
-api = Endpoint(name="my-service", gpu=GpuGroup.ANY, workers=(1, 3))
+api = Endpoint(name="my-service", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 3))
 
 # define HTTP routes with method decorators
 @api.get("/health")
diff --git a/03_advanced_workers/05_load_balancer/gpu_lb.py b/03_advanced_workers/05_load_balancer/gpu_lb.py
index a17301a..574b029 100644
--- a/03_advanced_workers/05_load_balancer/gpu_lb.py
+++ b/03_advanced_workers/05_load_balancer/gpu_lb.py
@@ -1,9 +1,13 @@
 # gpu load-balanced endpoints with custom HTTP routes.
 # run with: flash run
 # test directly: python gpu_lb.py
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
-api = Endpoint(name="03_05_load_balancer_gpu", gpu=GpuGroup.ANY, workers=(1, 3))
+api = Endpoint(
+    name="03_05_load_balancer_gpu",
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
+    workers=(1, 3),
+)
 
 
 @api.get("/health")
diff --git a/04_scaling_performance/01_autoscaling/README.md b/04_scaling_performance/01_autoscaling/README.md
index 2d8101a..3c1a09d 100644
--- a/04_scaling_performance/01_autoscaling/README.md
+++ b/04_scaling_performance/01_autoscaling/README.md
@@ -86,20 +86,20 @@ Requests arrive
 | `idle_timeout` | int | 60 | Minutes before idle workers terminate |
 | `scaler_type` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric |
 | `scaler_value` | int | 4 | Target value for the scaler metric |
-| `gpu` | GpuGroup or GpuType | ANY | GPU type for GPU endpoints |
+| `gpu` | GpuType or GpuGroup | -- | GPU type for GPU endpoints |
 | `cpu` | CpuInstanceType or str | -- | CPU instance type for CPU endpoints |
 
 ### Example Configurations
 
 ```python
-from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType
+from runpod_flash import Endpoint, GpuType, ServerlessScalerType
 
 # scale to zero, cost-optimized
 @Endpoint(
     name="batch-worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=1,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
     scaler_value=4,
 )
@@ -108,7 +108,7 @@ async def batch_process(payload: dict) -> dict: ...
 # always-on, latency-optimized
 @Endpoint(
     name="api-worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(1, 3),
     idle_timeout=60,
 )
@@ -117,7 +117,7 @@ async def api_process(payload: dict) -> dict: ...
 # high-throughput, burst-optimized
 @Endpoint(
     name="burst-worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(2, 10),
     idle_timeout=30,
     scaler_type=ServerlessScalerType.REQUEST_COUNT,
diff --git a/04_scaling_performance/01_autoscaling/gpu_worker.py b/04_scaling_performance/01_autoscaling/gpu_worker.py
index ca982a2..9af139c 100644
--- a/04_scaling_performance/01_autoscaling/gpu_worker.py
+++ b/04_scaling_performance/01_autoscaling/gpu_worker.py
@@ -1,7 +1,7 @@
 # gpu autoscaling strategies -- scale-to-zero, always-on, high-throughput.
 # run with: flash run
 # test directly: python gpu_worker.py
-from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType
+from runpod_flash import Endpoint, GpuType, ServerlessScalerType
 
 
 # --- strategy 1: scale to zero ---
@@ -9,9 +9,9 @@
 # workers scale down to zero after 5 minutes of idle time.
 @Endpoint(
     name="04_01_scale_to_zero",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
     scaler_value=4,
 )
@@ -55,7 +55,7 @@ async def scale_to_zero_inference(payload: dict) -> dict:
 # at least one worker stays warm to avoid cold starts.
 @Endpoint(
     name="04_01_always_on",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(1, 3),
     idle_timeout=60,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
@@ -101,7 +101,7 @@ async def always_on_inference(payload: dict) -> dict:
 # starts with 2 warm workers, scales aggressively to 10 based on request count.
 @Endpoint(
     name="04_01_high_throughput",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(2, 10),
     idle_timeout=30,
     scaler_type=ServerlessScalerType.REQUEST_COUNT,
diff --git a/05_data_workflows/01_network_volumes/gpu_worker.py b/05_data_workflows/01_network_volumes/gpu_worker.py
index f9ac69e..da47596 100644
--- a/05_data_workflows/01_network_volumes/gpu_worker.py
+++ b/05_data_workflows/01_network_volumes/gpu_worker.py
@@ -3,7 +3,7 @@
 # test directly: python gpu_worker.py
 import logging
 
-from runpod_flash import Endpoint, GpuGroup, NetworkVolume
+from runpod_flash import Endpoint, GpuType, NetworkVolume
 
 logger = logging.getLogger(__name__)
 
@@ -17,12 +17,12 @@
 
 @Endpoint(
     name="05_01_gpu_worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_5090,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     volume=volume,
     env={"HF_HUB_CACHE": MODEL_PATH, "MODEL_PATH": MODEL_PATH},
-    dependencies=["diffusers", "transformers"],
+    dependencies=["torch", "diffusers", "transformers", "accelerate"],
 )
 class SimpleSD:
     def __init__(self):
@@ -58,6 +58,7 @@ def __init__(self):
             f"Model weights stored in {model_path}: {os.listdir(model_path)}"
         )
 
+
     async def generate_image(self, prompt: str) -> dict:
         """Generate a single image from prompt."""
         self.logger.info(f"Generating image for: '{prompt}'")
diff --git a/CLAUDE.md b/CLAUDE.md
index a9a348c..b3e33be 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -44,13 +44,11 @@ All worker files across 6 categories. Each file is an independent entry point di
 
 **Queue-based (function decorator):**
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
 @Endpoint(
     name="my-worker",
-    gpu=GpuGroup.ANY,
-    workers=(0, 3),
-    idle_timeout=5,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
 )
 async def my_function(payload: dict) -> dict:
     """All runtime imports inside the function body."""
@@ -79,9 +77,9 @@ GPU vs CPU is a parameter, not a class choice:
 
 | Config | Syntax | Use Case |
 |--------|--------|----------|
-| GPU endpoint | `@Endpoint(name=..., gpu=GpuGroup.ANY)` | GPU workers |
+| GPU endpoint | `@Endpoint(name=..., gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)` | GPU workers |
 | CPU endpoint | `@Endpoint(name=..., cpu="cpu3c-1-2")` | CPU workers |
-| GPU LB | `api = Endpoint(name=..., gpu=GpuGroup.ANY); @api.post(...)` | GPU LB endpoints |
+| GPU LB | `api = Endpoint(name=..., gpu=GpuType.NVIDIA_GEFORCE_RTX_4090); @api.post(...)` | GPU LB endpoints |
 | CPU LB | `api = Endpoint(name=..., cpu="cpu3c-1-2"); @api.post(...)` | CPU LB endpoints |
 
 ### Cross-Worker Orchestration
@@ -108,7 +106,7 @@ All examples import from `runpod_flash`. Import frequency by symbol:
 | Symbol | Files Using It | Breakage Risk |
 |--------|---------------|---------------|
 | `Endpoint` | 18 | ALL examples break |
-| `GpuGroup` | 7 | GPU config breaks |
+| `GpuType` | 7 | GPU config breaks |
 | `CpuInstanceType` | 4 | CPU config breaks |
 | `NetworkVolume` | 2 | Volume examples break |
 | `ServerlessScalerType` | 1 | Scaling example breaks |
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f42f66b..35181ec 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -182,9 +182,9 @@ your_example/
 ### Minimal Worker (`gpu_worker.py`)
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
-@Endpoint(name="your-worker", gpu=GpuGroup.ANY, dependencies=["torch"])
+@Endpoint(name="your-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, dependencies=["torch"])
 async def your_function(payload: dict) -> dict:
     """
     Clear docstring explaining what this function does.
@@ -406,10 +406,10 @@ RUNPOD_API_KEY = "hardcoded_key"  # Never do this!
 
 ```python
 # Good - handle errors within @Endpoint functions
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
 # Good
-@Endpoint(name="processor", gpu=GpuGroup.ANY)
+@Endpoint(name="processor", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 async def process(data: dict) -> dict:
     try:
         result = do_work(data)
@@ -418,7 +418,7 @@ async def process(data: dict) -> dict:
         return {"status": "error", "detail": str(e)}
 
 # Bad
-@Endpoint(name="processor", gpu=GpuGroup.ANY)
+@Endpoint(name="processor", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 async def process(data: dict) -> dict:
     result = do_work(data)  # no error handling
     return result
diff --git a/docs/cli/troubleshooting.md b/docs/cli/troubleshooting.md
index 9f0cc58..54612b0 100644
--- a/docs/cli/troubleshooting.md
+++ b/docs/cli/troubleshooting.md
@@ -789,13 +789,13 @@ ERROR: Failed to create endpoint: Insufficient GPU availability
 
 **Solutions:**
 
-**1. Use more flexible GPU type:**
+**1. Switch to a commonly-available GPU type:**
 ```python
 # before (specific GPU)
-@Endpoint(name="worker", gpu=GpuGroup.A100)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB)
 
-# after (any GPU)
-@Endpoint(name="worker", gpu=GpuGroup.ANY)
+# after (widely available)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 ```
 
 Redeploy:
@@ -803,12 +803,12 @@ Redeploy:
 flash deploy --env production
 ```
 
-**2. Try different GPU type:**
+**2. Use GpuGroup for maximum flexibility:**
 ```python
-# More common/available GPUs
-gpus=[GpuGroup.RTX_4090]
-# or
-gpus=[GpuGroup.RTX_3090]
+# Accepts any GPU in the group
+gpu=GpuGroup.ADA_24
+# or any available GPU at all
+gpu=GpuGroup.ANY
 ```
 
 **3. Wait and retry:**
@@ -926,7 +926,7 @@ RuntimeError: CUDA not available
 Fix:
 ```python
 # ensure GPU specified in Endpoint
-@Endpoint(name="worker", gpu=GpuGroup.ANY)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 ```
 
 **4. Redeploy after fixing:**
@@ -1409,7 +1409,7 @@ flash deploy --env production
 | Environment not found | `flash env create <name>` |
 | Module not found | `pip install -e .` |
 | Upload failed | Retry or reduce size |
-| GPU unavailable | Use `gpus=[GpuGroup.ANY]` |
+| GPU unavailable | Use `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` |
 
 **Diagnostic Commands:**
 
diff --git a/docs/cli/workflows.md b/docs/cli/workflows.md
index 3283969..f0ea3b2 100644
--- a/docs/cli/workflows.md
+++ b/docs/cli/workflows.md
@@ -101,7 +101,7 @@ INFO: Application startup complete.
 Edit your worker files (e.g., `gpu_worker.py`):
 
 ```python
-@Endpoint(name="my-worker", gpu=GpuGroup.ANY)
+@Endpoint(name="my-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 async def process_request(payload: dict) -> dict:
     """Process incoming requests on GPU."""
     result = perform_processing(payload)
@@ -601,7 +601,7 @@ flash deploy --env production
 ```python
 @Endpoint(
     name="myapi_dev_gpu",
-    gpu=GpuGroup.ANY,      # any GPU is fine
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,  # specific GPU for fast provisioning
     workers=(0, 2),        # scale to zero, small max for cost
     idle_timeout=1,        # quick shutdown
 )
@@ -614,7 +614,7 @@ async def process(payload: dict) -> dict: ...
     name="myapi_prod_gpu",
     gpu=GpuGroup.A100,     # specific GPU for consistency
     workers=(1, 10),       # always have one ready, handle load spikes
-    idle_timeout=5,        # keep warm longer
+    idle_timeout=300,        # keep warm longer
 )
 async def process(payload: dict) -> dict: ...
 ```
@@ -1155,12 +1155,12 @@ flash env get production
 
 **Before (high cost):**
 ```python
-@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(5, 10))  # always 5 running
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(5, 10))  # always 5 running
 ```
 
 **After (optimized):**
 ```python
-@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(0, 10), idle_timeout=1)  # scale to zero
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 10), idle_timeout=1)  # scale to zero
 ```
 
 Redeploy with optimized config:
@@ -1448,13 +1448,13 @@ cat gpu_worker.py
 
 **Solutions:**
 
-**A. Change GPU type:**
+**A. Choose a commonly-available GPU type:**
 ```python
 # before (specific GPU, may not be available)
-@Endpoint(name="worker", gpu=GpuGroup.A100)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB)
 
-# after (more flexible)
-@Endpoint(name="worker", gpu=GpuGroup.ANY)
+# after (widely available)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 ```
 
 Redeploy:
@@ -1469,9 +1469,9 @@ sleep 300
 flash deploy --env production
 ```
 
-**C. Choose different GPU type:**
+**C. Use GpuGroup for maximum flexibility:**
 ```python
-@Endpoint(name="worker", gpu=GpuGroup.RTX_4090)  # more common
+@Endpoint(name="worker", gpu=GpuGroup.ANY)  # any available GPU
 ```
 
 #### Issue 5: Runtime Errors
@@ -1534,7 +1534,7 @@ RuntimeError: CUDA not available
 
 Solution: Verify GPU configuration:
 ```python
-@Endpoint(name="worker", gpu=GpuGroup.ANY)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 ```
 
 #### Issue 6: Performance Issues
@@ -1560,9 +1560,9 @@ flash env get production
 # keep workers warm with workers=(1, N)
 @Endpoint(
     name="worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(1, 5),       # keep 1 warm
-    idle_timeout=5,       # keep alive longer
+    idle_timeout=600,       # keep alive longer
 )
 ```
 
@@ -1575,7 +1575,7 @@ flash env get production
 # lazy loading example
 _model = None
 
-@Endpoint(name="worker", gpu=GpuGroup.ANY)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
 async def infer(payload: dict) -> dict:
     global _model
     if _model is None:
@@ -1586,7 +1586,7 @@ async def infer(payload: dict) -> dict:
 **C. Increase worker capacity:**
 ```python
 # handle more concurrent requests
-@Endpoint(name="worker", gpu=GpuGroup.ANY, workers=(0, 10))
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 10))
 ```
 
 ### General Debugging Approach

From 2f029529e1223b1b3afcc6854aba40c0db66182a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 8 Mar 2026 09:48:01 -0700
Subject: [PATCH 3/7] fix: correct idle_timeout values and dependency pins
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- idle_timeout is in seconds, not minutes — update values accordingly
- Remove unnecessary workers/idle_timeout defaults from simple examples
- Unpin pandas/numpy versions for cross-Python-version compatibility
- Add missing accelerate dependency to SD example
- Add explicit workers to LB endpoints that need them
---
 01_getting_started/01_hello_world/gpu_worker.py     | 4 +---
 01_getting_started/02_cpu_worker/cpu_worker.py      | 2 --
 01_getting_started/03_mixed_workers/gpu_worker.py   | 1 -
 01_getting_started/04_dependencies/cpu_worker.py    | 4 ++--
 02_ml_inference/01_text_to_speech/gpu_worker.py     | 2 +-
 03_advanced_workers/05_load_balancer/cpu_lb.py      | 6 +++++-
 04_scaling_performance/01_autoscaling/cpu_worker.py | 2 +-
 05_data_workflows/01_network_volumes/cpu_worker.py  | 1 +
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/01_getting_started/01_hello_world/gpu_worker.py b/01_getting_started/01_hello_world/gpu_worker.py
index 5c7c093..d204c16 100644
--- a/01_getting_started/01_hello_world/gpu_worker.py
+++ b/01_getting_started/01_hello_world/gpu_worker.py
@@ -6,9 +6,7 @@
 
 @Endpoint(
     name="01_01_gpu_worker",
-    gpu=GpuGroup.ANY,
-    workers=(0, 3),
-    idle_timeout=5,
+    gpu=GpuGroup.ADA_24,
 )
 async def gpu_hello(input_data: dict) -> dict:
     """GPU worker that returns GPU hardware info."""
diff --git a/01_getting_started/02_cpu_worker/cpu_worker.py b/01_getting_started/02_cpu_worker/cpu_worker.py
index 01fdef7..0679296 100644
--- a/01_getting_started/02_cpu_worker/cpu_worker.py
+++ b/01_getting_started/02_cpu_worker/cpu_worker.py
@@ -7,8 +7,6 @@
 @Endpoint(
     name="01_02_cpu_worker",
     cpu=CpuInstanceType.CPU3C_1_2,
-    workers=(0, 3),
-    idle_timeout=5,
 )
 async def cpu_hello(input_data: dict) -> dict:
     """CPU worker that returns a greeting."""
diff --git a/01_getting_started/03_mixed_workers/gpu_worker.py b/01_getting_started/03_mixed_workers/gpu_worker.py
index 64ab66c..b6ae065 100644
--- a/01_getting_started/03_mixed_workers/gpu_worker.py
+++ b/01_getting_started/03_mixed_workers/gpu_worker.py
@@ -9,7 +9,6 @@
     name="01_03_mixed_inference",
     gpu=GpuGroup.ADA_24,
     workers=(0, 3),
-    idle_timeout=5,
 )
 async def gpu_inference(input_data: dict) -> dict:
     """GPU inference: mock sentiment classification."""
diff --git a/01_getting_started/04_dependencies/cpu_worker.py b/01_getting_started/04_dependencies/cpu_worker.py
index 05f994c..64e2c96 100644
--- a/01_getting_started/04_dependencies/cpu_worker.py
+++ b/01_getting_started/04_dependencies/cpu_worker.py
@@ -10,8 +10,8 @@
     cpu=CpuInstanceType.CPU3C_8_16,
     workers=(0, 3),
     dependencies=[
-        "pandas==2.1.3",
-        "numpy==1.26.2",
+        "pandas",
+        "numpy",
         "scipy>=1.11.0",
         "matplotlib",
     ],
diff --git a/02_ml_inference/01_text_to_speech/gpu_worker.py b/02_ml_inference/01_text_to_speech/gpu_worker.py
index a9e3cd1..6d60e01 100644
--- a/02_ml_inference/01_text_to_speech/gpu_worker.py
+++ b/02_ml_inference/01_text_to_speech/gpu_worker.py
@@ -8,7 +8,7 @@
     name="02_01_text_to_speech_gpu",
     gpu=GpuGroup.ADA_24,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     dependencies=["qwen-tts", "soundfile"],
 )
 async def generate_speech(input_data: dict) -> dict:
diff --git a/03_advanced_workers/05_load_balancer/cpu_lb.py b/03_advanced_workers/05_load_balancer/cpu_lb.py
index 47f1d95..08a9105 100644
--- a/03_advanced_workers/05_load_balancer/cpu_lb.py
+++ b/03_advanced_workers/05_load_balancer/cpu_lb.py
@@ -3,7 +3,11 @@
 # test directly: python cpu_lb.py
 from runpod_flash import Endpoint
 
-api = Endpoint(name="03_05_load_balancer_cpu", cpu="cpu3c-1-2")
+api = Endpoint(
+    name="03_05_load_balancer_cpu",
+    cpu="cpu3c-1-2",
+    workers=(1, 3),
+)
 
 
 @api.get("/health")
diff --git a/04_scaling_performance/01_autoscaling/cpu_worker.py b/04_scaling_performance/01_autoscaling/cpu_worker.py
index a508158..fa4f206 100644
--- a/04_scaling_performance/01_autoscaling/cpu_worker.py
+++ b/04_scaling_performance/01_autoscaling/cpu_worker.py
@@ -10,7 +10,7 @@
     name="04_01_cpu_scale_to_zero",
     cpu=CpuInstanceType.CPU3C_1_2,
     workers=(0, 5),
-    idle_timeout=5,
+    idle_timeout=1,
 )
 async def cpu_scale_to_zero(payload: dict) -> dict:
     """CPU worker with scale-to-zero -- cost-optimized preprocessing."""
diff --git a/05_data_workflows/01_network_volumes/cpu_worker.py b/05_data_workflows/01_network_volumes/cpu_worker.py
index d2c02cf..5d1dad4 100644
--- a/05_data_workflows/01_network_volumes/cpu_worker.py
+++ b/05_data_workflows/01_network_volumes/cpu_worker.py
@@ -12,6 +12,7 @@
     name="05_01_cpu_worker",
     cpu="cpu3c-1-2",
     workers=(1, 3),
+    idle_timeout=120,
     volume=volume,
 )
 

From f7593ca2d39f4beae6c2c3d163fd779c576354a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 8 Mar 2026 09:51:02 -0700
Subject: [PATCH 4/7] chore: format gpu_worker.py

---
 05_data_workflows/01_network_volumes/gpu_worker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/05_data_workflows/01_network_volumes/gpu_worker.py b/05_data_workflows/01_network_volumes/gpu_worker.py
index da47596..fd4c7b2 100644
--- a/05_data_workflows/01_network_volumes/gpu_worker.py
+++ b/05_data_workflows/01_network_volumes/gpu_worker.py
@@ -58,7 +58,6 @@ def __init__(self):
             f"Model weights stored in {model_path}: {os.listdir(model_path)}"
         )
 
-
     async def generate_image(self, prompt: str) -> dict:
         """Generate a single image from prompt."""
         self.logger.info(f"Generating image for: '{prompt}'")

From 5a42db41847bfdc6f87578c7416c3a863e64d6ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 8 Mar 2026 21:43:42 -0700
Subject: [PATCH 5/7] chore: add project metadata and tighten Python upper
 bound to <3.13

---
 pyproject.toml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c54284a..ad95415 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,8 +2,18 @@
 name = "runpod-flash-examples"
 version = "1.0.0"
 description = "A collection of example applications showcasing Runpod Flash - a framework for building production-ready AI applications with distributed GPU and CPU computing."
+authors = [
+    { name = "Runpod", email = "engineer@runpod.io" },
+]
+license = { text = "MIT" }
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
 readme = "README.md"
-requires-python = ">=3.10,<3.14"
+requires-python = ">=3.10,<3.13"
 dependencies = [
     "runpod-flash>=1.7.0",
 ]

From af4b3675b047c50424432ee3a00bb53aa9363abf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 9 Mar 2026 08:42:46 -0700
Subject: [PATCH 6/7] fix(review): address PR #41 feedback -- GpuType naming,
 idle_timeout, examples

- Fix GpuType.NVIDIA_A100_80GB to NVIDIA_A100_80GB_PCIe across docs
- Fix GpuType.NVIDIA_H100_80GB to NVIDIA_H100_80GB_HBM3 in README
- Migrate hello_world gpu_worker from GpuGroup.ADA_24 to GpuType.NVIDIA_GEFORCE_RTX_4090
- Fix idle_timeout config dict mismatches in autoscaling workers
- Correct idle_timeout units from minutes to seconds in README tables
- Use GpuGroup.ANY for GPU unavailable troubleshooting quick fix
---
 01_getting_started/01_hello_world/README.md        |  2 +-
 01_getting_started/01_hello_world/gpu_worker.py    |  4 ++--
 04_scaling_performance/01_autoscaling/README.md    | 14 +++++++-------
 .../01_autoscaling/cpu_worker.py                   |  2 +-
 .../01_autoscaling/gpu_worker.py                   |  2 +-
 docs/cli/troubleshooting.md                        |  4 ++--
 docs/cli/workflows.md                              |  2 +-
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/01_getting_started/01_hello_world/README.md b/01_getting_started/01_hello_world/README.md
index 91181b9..beb291c 100644
--- a/01_getting_started/01_hello_world/README.md
+++ b/01_getting_started/01_hello_world/README.md
@@ -145,7 +145,7 @@ flash run
 
 ## Next Steps
 
-- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. `GpuType.NVIDIA_A100_80GB`, `GpuType.NVIDIA_H100_80GB`)
+- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. `GpuType.NVIDIA_A100_80GB_PCIe`, `GpuType.NVIDIA_H100_80GB_HBM3`)
 - Add your own GPU-accelerated code
 - Implement error handling and validation
 - Deploy to production with `flash deploy`
diff --git a/01_getting_started/01_hello_world/gpu_worker.py b/01_getting_started/01_hello_world/gpu_worker.py
index d204c16..d7a330f 100644
--- a/01_getting_started/01_hello_world/gpu_worker.py
+++ b/01_getting_started/01_hello_world/gpu_worker.py
@@ -1,12 +1,12 @@
 # gpu serverless worker -- detects available GPU hardware.
 # run with: flash run
 # test directly: python gpu_worker.py
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
 
 @Endpoint(
     name="01_01_gpu_worker",
-    gpu=GpuGroup.ADA_24,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
 )
 async def gpu_hello(input_data: dict) -> dict:
     """GPU worker that returns GPU hardware info."""
diff --git a/04_scaling_performance/01_autoscaling/README.md b/04_scaling_performance/01_autoscaling/README.md
index 3c1a09d..0e02e67 100644
--- a/04_scaling_performance/01_autoscaling/README.md
+++ b/04_scaling_performance/01_autoscaling/README.md
@@ -38,16 +38,16 @@ curl -X POST http://localhost:8888/cpu_worker/runsync \
 
 | Strategy | workers | idle_timeout | scaler_type | scaler_value | Use Case |
 |----------|---------|-------------|-------------|-------------|----------|
-| Scale to Zero | (0, 3) | 5 min | QUEUE_DELAY | 4 | Sporadic/batch, cost-first |
-| Always On | (1, 3) | 60 min | QUEUE_DELAY | 4 | Steady traffic, latency-first |
-| High Throughput | (2, 10) | 30 min | REQUEST_COUNT | 3 | Bursty traffic, throughput-first |
+| Scale to Zero | (0, 3) | 300s | QUEUE_DELAY | 4 | Sporadic/batch, cost-first |
+| Always On | (1, 3) | 60s | QUEUE_DELAY | 4 | Steady traffic, latency-first |
+| High Throughput | (2, 10) | 30s | REQUEST_COUNT | 3 | Bursty traffic, throughput-first |
 
 ### CPU Workers (`cpu_worker.py`)
 
 | Strategy | workers | idle_timeout | Use Case |
 |----------|---------|-------------|----------|
-| Scale to Zero | (0, 5) | 5 min | Cost-optimized preprocessing |
-| Burst Ready | (1, 10) | 30 min | Always-warm API gateway |
+| Scale to Zero | (0, 5) | 1s | Cost-optimized preprocessing |
+| Burst Ready | (1, 10) | 30s | Always-warm API gateway |
 
 ## How Autoscaling Works
 
@@ -83,7 +83,7 @@ Requests arrive
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
 | `workers` | int or (min, max) | (0, 1) | Worker scaling bounds |
-| `idle_timeout` | int | 60 | Minutes before idle workers terminate |
+| `idle_timeout` | int | 60 | Seconds before idle workers terminate |
 | `scaler_type` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric |
 | `scaler_value` | int | 4 | Target value for the scaler metric |
 | `gpu` | GpuType or GpuGroup | -- | GPU type for GPU endpoints |
@@ -99,7 +99,7 @@ from runpod_flash import Endpoint, GpuType, ServerlessScalerType
     name="batch-worker",
     gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(0, 3),
-    idle_timeout=1,
+    idle_timeout=300,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
     scaler_value=4,
 )
diff --git a/04_scaling_performance/01_autoscaling/cpu_worker.py b/04_scaling_performance/01_autoscaling/cpu_worker.py
index fa4f206..6660ea3 100644
--- a/04_scaling_performance/01_autoscaling/cpu_worker.py
+++ b/04_scaling_performance/01_autoscaling/cpu_worker.py
@@ -39,7 +39,7 @@ async def cpu_scale_to_zero(payload: dict) -> dict:
             "token_count": len(tokens),
             "byte_size": byte_size,
         },
-        "config": {"workersMin": 0, "workersMax": 5, "idleTimeout": 5},
+        "config": {"workersMin": 0, "workersMax": 5, "idleTimeout": 1},
         "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
     }
 
diff --git a/04_scaling_performance/01_autoscaling/gpu_worker.py b/04_scaling_performance/01_autoscaling/gpu_worker.py
index 9af139c..2d12fb0 100644
--- a/04_scaling_performance/01_autoscaling/gpu_worker.py
+++ b/04_scaling_performance/01_autoscaling/gpu_worker.py
@@ -45,7 +45,7 @@ async def scale_to_zero_inference(payload: dict) -> dict:
             if torch.cuda.is_available()
             else "N/A",
         },
-        "config": {"workersMin": 0, "workersMax": 3, "idleTimeout": 5},
+        "config": {"workersMin": 0, "workersMax": 3, "idleTimeout": 300},
         "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
     }
 
diff --git a/docs/cli/troubleshooting.md b/docs/cli/troubleshooting.md
index 54612b0..9d689a6 100644
--- a/docs/cli/troubleshooting.md
+++ b/docs/cli/troubleshooting.md
@@ -792,7 +792,7 @@ ERROR: Failed to create endpoint: Insufficient GPU availability
 **1. Switch to a commonly-available GPU type:**
 ```python
 # before (specific GPU)
-@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB_PCIe)
 
 # after (widely available)
 @Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)
@@ -1409,7 +1409,7 @@ flash deploy --env production
 | Environment not found | `flash env create <name>` |
 | Module not found | `pip install -e .` |
 | Upload failed | Retry or reduce size |
-| GPU unavailable | Use `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` |
+| GPU unavailable | Use `gpu=GpuGroup.ANY` or `gpu=GpuType.ANY` |
 
 **Diagnostic Commands:**
 
diff --git a/docs/cli/workflows.md b/docs/cli/workflows.md
index f0ea3b2..78bc718 100644
--- a/docs/cli/workflows.md
+++ b/docs/cli/workflows.md
@@ -1451,7 +1451,7 @@ cat gpu_worker.py
 **A. Choose a commonly-available GPU type:**
 ```python
 # before (specific GPU, may not be available)
-@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB)
+@Endpoint(name="worker", gpu=GpuType.NVIDIA_A100_80GB_PCIe)
 
 # after (widely available)
 @Endpoint(name="worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090)

From e4ec7e9ba5baef79b3ed78bd9e94aa03cf30a250 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 9 Mar 2026 17:20:30 -0700
Subject: [PATCH 7/7] chore: bump runpod-flash minimum version to 1.8.0

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ad95415..3be2308 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ classifiers = [
 readme = "README.md"
 requires-python = ">=3.10,<3.13"
 dependencies = [
-    "runpod-flash>=1.7.0",
+    "runpod-flash>=1.8.0",
 ]
 
 [dependency-groups]