Merged
6 changes: 3 additions & 3 deletions 01_getting_started/01_hello_world/README.md
@@ -112,9 +112,9 @@ The `@Endpoint` decorator transparently executes functions on serverless infrast
 - Handles serialization and resource management
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
-@Endpoint(name="my-worker", gpu=GpuGroup.ANY, workers=(0, 3))
+@Endpoint(name="my-worker", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(0, 3))
 async def my_function(data: dict) -> dict:
     return {"result": "processed"}
 ```
@@ -145,7 +145,7 @@ flash run

 ## Next Steps
 
-- Customize GPU type: Change `GpuGroup.ANY` to a specific GPU (e.g. `GpuGroup.ADA_24`, `GpuGroup.AMPERE_80`)
+- Customize GPU type: Change `GpuType.NVIDIA_GEFORCE_RTX_4090` to another GPU (e.g. `GpuType.NVIDIA_A100_80GB_PCIe`, `GpuType.NVIDIA_H100_80GB_HBM3`)
 - Add your own GPU-accelerated code
 - Implement error handling and validation
 - Deploy to production with `flash deploy`
6 changes: 2 additions & 4 deletions 01_getting_started/01_hello_world/gpu_worker.py
@@ -1,14 +1,12 @@
 # gpu serverless worker -- detects available GPU hardware.
 # run with: flash run
 # test directly: python gpu_worker.py
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
 
 @Endpoint(
     name="01_01_gpu_worker",
-    gpu=GpuGroup.ANY,
-    workers=(0, 3),
-    idle_timeout=5,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
 )
 async def gpu_hello(input_data: dict) -> dict:
     """GPU worker that returns GPU hardware info."""
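The `test directly: python gpu_worker.py` comment implies the decorated coroutine stays callable in-process. A minimal local smoke-test sketch, with a stand-in body (the real worker reports GPU hardware, and whether `@Endpoint` preserves direct callability is an assumption here):

```python
import asyncio

# stand-in for the decorated gpu_hello worker; the real body reports GPU hardware
async def gpu_hello(input_data: dict) -> dict:
    name = input_data.get("name", "world")
    return {"greeting": f"hello, {name}"}

# invoke the coroutine directly, as a `python gpu_worker.py` smoke test might
result = asyncio.run(gpu_hello({"name": "flash"}))
print(result)  # {'greeting': 'hello, flash'}
```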
2 changes: 0 additions & 2 deletions 01_getting_started/02_cpu_worker/cpu_worker.py
@@ -7,8 +7,6 @@
 @Endpoint(
     name="01_02_cpu_worker",
     cpu=CpuInstanceType.CPU3C_1_2,
-    workers=(0, 3),
-    idle_timeout=5,
 )
 async def cpu_hello(input_data: dict) -> dict:
     """CPU worker that returns a greeting."""
4 changes: 2 additions & 2 deletions 01_getting_started/03_mixed_workers/README.md
@@ -134,7 +134,7 @@ Total: $0.0019/sec
     name="preprocess_worker",
     cpu=CpuInstanceType.CPU3G_2_8,  # 2 vCPU, 8GB
     workers=(0, 10),
-    idle_timeout=3,
+    idle_timeout=180,
 )
 async def preprocess_text(input_data: dict) -> dict: ...
 ```
@@ -148,7 +148,7 @@ async def preprocess_text(input_data: dict) -> dict: ...
     name="inference_worker",
     gpu=GpuGroup.ADA_24,  # RTX 4090
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     dependencies=["torch"],
 )
 async def gpu_inference(input_data: dict) -> dict: ...
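The longer `idle_timeout` values in this file trade cold starts against idle spend, and the trade-off is plain arithmetic. A sketch using the README's illustrative $0.0019/sec total (an example figure from the diff context above, not a quoted price):

```python
def max_idle_spend(price_per_sec: float, idle_timeout_s: int) -> float:
    """Worst-case cost of one warm-but-idle worker before it scales down."""
    return price_per_sec * idle_timeout_s

# 300s idle window vs the previous 5s one, at the illustrative rate
print(max_idle_spend(0.0019, 300))
print(max_idle_spend(0.0019, 5))
```

At roughly $0.57 per scale-down cycle versus under a cent, the longer window only pays off when cold starts are frequent enough to hurt latency.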
1 change: 0 additions & 1 deletion 01_getting_started/03_mixed_workers/gpu_worker.py
@@ -9,7 +9,6 @@
     name="01_03_mixed_inference",
     gpu=GpuGroup.ADA_24,
     workers=(0, 3),
-    idle_timeout=5,
 )
 async def gpu_inference(input_data: dict) -> dict:
     """GPU inference: mock sentiment classification."""
56 changes: 27 additions & 29 deletions 01_getting_started/04_dependencies/README.md
@@ -6,7 +6,7 @@ Learn how to manage Python packages and system dependencies in Flash workers.

 - **Python dependencies** - Installing packages with version constraints
 - **System dependencies** - Installing apt packages (ffmpeg, libgl1, etc.)
-- **Version pinning** - Reproducible builds with exact versions
+- **Version constraints** - Supported syntax for version pinning
 - **Dependency optimization** - Minimizing cold start time
 
 ## Quick Start
@@ -105,7 +105,7 @@ async def simple_function(data: dict) -> dict:
 ```python
 "requests==2.32.3"  # Exactly 2.32.3
 ```
-**Use when:** You need reproducible builds
+**Use when:** You need reproducible builds for a specific Python version
 
 ### Minimum Version (>=)
 ```python
@@ -117,7 +117,7 @@ async def simple_function(data: dict) -> dict:
 ```python
 "python-dateutil<3.0.0"  # Below 3.0.0
 ```
-**Use when:** Avoiding breaking changes
+**Use when:** Avoiding breaking changes in a major release
 
 ### Compatible Release (~=)
 ```python
@@ -129,7 +129,7 @@ async def simple_function(data: dict) -> dict:
 ```python
 "pandas"  # Latest available
 ```
-**Use when:** You always want the newest version (not recommended for production)
+**Use when:** You want the latest compatible version (recommended for examples and prototyping)

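The compatible-release operator `~=` is the least obvious of the constraint styles this README covers. A toy checker for numeric dotted versions shows its semantics (real resolvers follow the full PEP 440 rules; this sketch ignores pre-releases and epochs):

```python
def compatible_release(version: str, spec: str) -> bool:
    """True if `version` satisfies `~= spec`: at least `spec`, and matching
    `spec` on every component except the last (~=2.1.3 means >=2.1.3, ==2.1.*)."""
    v = [int(part) for part in version.split(".")]
    s = [int(part) for part in spec.split(".")]
    return v >= s and v[: len(s) - 1] == s[: len(s) - 1]

print(compatible_release("2.1.9", "2.1.3"))  # True  -- patch bump stays compatible
print(compatible_release("2.2.0", "2.1.3"))  # False -- minor bump escapes ~=2.1.3
```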
## Common Dependencies

@@ -146,8 +146,8 @@ dependencies=[
 ### Data Science
 ```python
 dependencies=[
-    "pandas==2.1.3",
-    "numpy==1.26.2",
+    "pandas",
+    "numpy",
     "scipy>=1.11.0",
     "matplotlib",
     "scikit-learn",
@@ -196,32 +196,28 @@ system_dependencies=["ffmpeg", "libgl1", "wget"]

 ## Best Practices
 
-### 1. Pin Versions for Production
+### 1. Use Version Constraints Thoughtfully
 
 ```python
-# Good - Reproducible
-@Endpoint(
-    name="worker",
-    gpu=GpuGroup.ADA_24,
-    dependencies=[
-        "requests==2.32.3",
-        "transformers==4.35.2",
-        "numpy==1.26.2",
-    ],
-)
+# Good for examples and prototyping - works across Python versions
+dependencies=[
+    "requests",
+    "transformers",
+    "numpy",
+]
 
-# Bad - Unpredictable
-@Endpoint(
-    name="worker",
-    gpu=GpuGroup.ADA_24,
-    dependencies=[
-        "requests",  # Version changes over time
-        "transformers",
-        "numpy",
-    ],
-)
+# Good for production - reproducible on a known Python version
+dependencies=[
+    "requests==2.32.3",
+    "transformers==4.35.2",
+    "numpy==1.26.2",
+]
 ```
 
+Exact pins can break across Python versions (e.g., older numpy
+builds don't exist for Python 3.13+). Pin only when you control
+the target Python version.
+
 ### 2. Minimize Dependencies
 
 ```python
@@ -287,7 +283,9 @@ ERROR: Cannot install requests==2.25.0 and urllib3==2.2.1
 because these package versions have conflicting dependencies.
 ```
 
-**Solution:** Check compatibility matrix, adjust versions:
+**Solutions:**
+1. Drop exact pins and let pip resolve compatible versions
+2. Check compatibility matrix and adjust versions:
 ```python
 dependencies=[
     "requests>=2.32.0",
@@ -332,7 +330,7 @@ For local development, create `requirements.txt`:
 runpod-flash
 transformers==4.35.2
 Pillow>=10.0.0
-numpy==1.26.2
+numpy
 ```
 
 **Note:** Worker dependencies in the `Endpoint` decorator are deployed automatically. `requirements.txt` is for local development only.
4 changes: 2 additions & 2 deletions 01_getting_started/04_dependencies/cpu_worker.py
@@ -10,8 +10,8 @@
     cpu=CpuInstanceType.CPU3C_8_16,
     workers=(0, 3),
     dependencies=[
-        "pandas==2.1.3",
-        "numpy==1.26.2",
+        "pandas",
+        "numpy",
         "scipy>=1.11.0",
         "matplotlib",
     ],
2 changes: 1 addition & 1 deletion 02_ml_inference/01_text_to_speech/gpu_worker.py
@@ -8,7 +8,7 @@
     name="02_01_text_to_speech_gpu",
     gpu=GpuGroup.ADA_24,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     dependencies=["qwen-tts", "soundfile"],
 )
 async def generate_speech(input_data: dict) -> dict:
4 changes: 2 additions & 2 deletions 03_advanced_workers/05_load_balancer/README.md
@@ -86,10 +86,10 @@ curl -X POST http://localhost:8888/05_load_balancer/cpu/transform \
 Load-balanced endpoints use the `Endpoint` class with route decorators (`.get()`, `.post()`, etc.) to define HTTP routes. The decorator automatically registers the function as an HTTP endpoint on the load-balancer runtime.
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
 # create load-balanced endpoint
-api = Endpoint(name="my-service", gpu=GpuGroup.ANY, workers=(1, 3))
+api = Endpoint(name="my-service", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 3))
 
 # define HTTP routes with method decorators
 @api.get("/health")
6 changes: 5 additions & 1 deletion 03_advanced_workers/05_load_balancer/cpu_lb.py
@@ -3,7 +3,11 @@
 # test directly: python cpu_lb.py
 from runpod_flash import Endpoint
 
-api = Endpoint(name="03_05_load_balancer_cpu", cpu="cpu3c-1-2")
+api = Endpoint(
+    name="03_05_load_balancer_cpu",
+    cpu="cpu3c-1-2",
+    workers=(1, 3),
+)
 
 
 @api.get("/health")
8 changes: 6 additions & 2 deletions 03_advanced_workers/05_load_balancer/gpu_lb.py
@@ -1,9 +1,13 @@
 # gpu load-balanced endpoints with custom HTTP routes.
 # run with: flash run
 # test directly: python gpu_lb.py
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuType
 
-api = Endpoint(name="03_05_load_balancer_gpu", gpu=GpuGroup.ANY, workers=(1, 3))
+api = Endpoint(
+    name="03_05_load_balancer_gpu",
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
+    workers=(1, 3),
+)
 
 
 @api.get("/health")
24 changes: 12 additions & 12 deletions 04_scaling_performance/01_autoscaling/README.md
@@ -38,16 +38,16 @@ curl -X POST http://localhost:8888/cpu_worker/runsync \

 | Strategy | workers | idle_timeout | scaler_type | scaler_value | Use Case |
 |----------|---------|-------------|-------------|-------------|----------|
-| Scale to Zero | (0, 3) | 5 min | QUEUE_DELAY | 4 | Sporadic/batch, cost-first |
-| Always On | (1, 3) | 60 min | QUEUE_DELAY | 4 | Steady traffic, latency-first |
-| High Throughput | (2, 10) | 30 min | REQUEST_COUNT | 3 | Bursty traffic, throughput-first |
+| Scale to Zero | (0, 3) | 300s | QUEUE_DELAY | 4 | Sporadic/batch, cost-first |
+| Always On | (1, 3) | 60s | QUEUE_DELAY | 4 | Steady traffic, latency-first |
+| High Throughput | (2, 10) | 30s | REQUEST_COUNT | 3 | Bursty traffic, throughput-first |
 
 ### CPU Workers (`cpu_worker.py`)
 
 | Strategy | workers | idle_timeout | Use Case |
 |----------|---------|-------------|----------|
-| Scale to Zero | (0, 5) | 5 min | Cost-optimized preprocessing |
-| Burst Ready | (1, 10) | 30 min | Always-warm API gateway |
+| Scale to Zero | (0, 5) | 1s | Cost-optimized preprocessing |
+| Burst Ready | (1, 10) | 30s | Always-warm API gateway |
 
 ## How Autoscaling Works
 
@@ -83,23 +83,23 @@ Requests arrive
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
 | `workers` | int or (min, max) | (0, 1) | Worker scaling bounds |
-| `idle_timeout` | int | 60 | Minutes before idle workers terminate |
+| `idle_timeout` | int | 60 | Seconds before idle workers terminate |
 | `scaler_type` | ServerlessScalerType | QUEUE_DELAY | Scaling trigger metric |
 | `scaler_value` | int | 4 | Target value for the scaler metric |
-| `gpu` | GpuGroup or GpuType | ANY | GPU type for GPU endpoints |
+| `gpu` | GpuType or GpuGroup | -- | GPU type for GPU endpoints |
 | `cpu` | CpuInstanceType or str | -- | CPU instance type for CPU endpoints |
 
 ### Example Configurations
 
 ```python
-from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType
+from runpod_flash import Endpoint, GpuType, ServerlessScalerType
 
 # scale to zero, cost-optimized
 @Endpoint(
     name="batch-worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
     scaler_value=4,
 )
Expand All @@ -108,7 +108,7 @@ async def batch_process(payload: dict) -> dict: ...
 # always-on, latency-optimized
 @Endpoint(
     name="api-worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(1, 3),
     idle_timeout=60,
 )
@@ -117,7 +117,7 @@ async def api_process(payload: dict) -> dict: ...
 # high-throughput, burst-optimized
 @Endpoint(
     name="burst-worker",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(2, 10),
     idle_timeout=30,
     scaler_type=ServerlessScalerType.REQUEST_COUNT,
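The scaler parameters in this README read as a simple control loop. A toy sketch of the two trigger styles, clamped to the `workers` bounds (an illustration of the concept only, not Runpod's actual scheduler logic, whose exact policy is not documented in this diff):

```python
def desired_workers(scaler_type: str, scaler_value: int, queued: int,
                    oldest_wait_s: float, running: int,
                    min_workers: int, max_workers: int) -> int:
    """Toy scaling decision: QUEUE_DELAY adds a worker once the oldest queued
    request has waited past the target; REQUEST_COUNT sizes the pool so each
    worker handles roughly `scaler_value` queued requests."""
    if scaler_type == "QUEUE_DELAY":
        target = running + 1 if queued and oldest_wait_s > scaler_value else running
    else:  # REQUEST_COUNT
        target = -(-queued // scaler_value) if queued else running  # ceil division
    return max(min_workers, min(max_workers, target))

# queue delay of 9s against a 4s target: grow from 1 to 2 workers
print(desired_workers("QUEUE_DELAY", 4, queued=2, oldest_wait_s=9.0,
                      running=1, min_workers=0, max_workers=3))  # 2
# 12 queued requests at ~3 per worker: grow from 2 to 4 workers
print(desired_workers("REQUEST_COUNT", 3, queued=12, oldest_wait_s=1.0,
                      running=2, min_workers=2, max_workers=10))  # 4
```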
4 changes: 2 additions & 2 deletions 04_scaling_performance/01_autoscaling/cpu_worker.py
@@ -10,7 +10,7 @@
     name="04_01_cpu_scale_to_zero",
     cpu=CpuInstanceType.CPU3C_1_2,
     workers=(0, 5),
-    idle_timeout=5,
+    idle_timeout=1,
 )
 async def cpu_scale_to_zero(payload: dict) -> dict:
     """CPU worker with scale-to-zero -- cost-optimized preprocessing."""
@@ -39,7 +39,7 @@ async def cpu_scale_to_zero(payload: dict) -> dict:
             "token_count": len(tokens),
             "byte_size": byte_size,
         },
-        "config": {"workersMin": 0, "workersMax": 5, "idleTimeout": 5},
+        "config": {"workersMin": 0, "workersMax": 5, "idleTimeout": 1},
         "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
     }
 

12 changes: 6 additions & 6 deletions 04_scaling_performance/01_autoscaling/gpu_worker.py
@@ -1,17 +1,17 @@
 # gpu autoscaling strategies -- scale-to-zero, always-on, high-throughput.
 # run with: flash run
 # test directly: python gpu_worker.py
-from runpod_flash import Endpoint, GpuGroup, ServerlessScalerType
+from runpod_flash import Endpoint, GpuType, ServerlessScalerType
 
 
 # --- strategy 1: scale to zero ---
 # sporadic or batch workloads where cost matters more than cold-start latency.
 # workers scale down to zero after 5 minutes of idle time.
 @Endpoint(
     name="04_01_scale_to_zero",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(0, 3),
-    idle_timeout=5,
+    idle_timeout=300,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
     scaler_value=4,
 )
@@ -45,7 +45,7 @@ async def scale_to_zero_inference(payload: dict) -> dict:
             if torch.cuda.is_available()
             else "N/A",
         },
-        "config": {"workersMin": 0, "workersMax": 3, "idleTimeout": 5},
+        "config": {"workersMin": 0, "workersMax": 3, "idleTimeout": 300},
         "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
     }
 

@@ -55,7 +55,7 @@ async def scale_to_zero_inference(payload: dict) -> dict:
 # at least one worker stays warm to avoid cold starts.
 @Endpoint(
     name="04_01_always_on",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(1, 3),
     idle_timeout=60,
     scaler_type=ServerlessScalerType.QUEUE_DELAY,
@@ -101,7 +101,7 @@ async def always_on_inference(payload: dict) -> dict:
 # starts with 2 warm workers, scales aggressively to 10 based on request count.
 @Endpoint(
     name="04_01_high_throughput",
-    gpu=GpuGroup.ANY,
+    gpu=GpuType.NVIDIA_GEFORCE_RTX_4090,
     workers=(2, 10),
     idle_timeout=30,
     scaler_type=ServerlessScalerType.REQUEST_COUNT,
1 change: 1 addition & 0 deletions 05_data_workflows/01_network_volumes/cpu_worker.py
@@ -12,6 +12,7 @@
     name="05_01_cpu_worker",
     cpu="cpu3c-1-2",
     workers=(1, 3),
+    idle_timeout=120,
     volume=volume,
 )
 