# Disaggregated Prefill Examples for LMCache with vLLM v1

This directory contains examples demonstrating how to run LMCache with disaggregated prefill using NIXL. Disaggregated prefill allows you to separate the prefill (prompt processing) and decode (token generation) phases of LLM inference across different GPU instances, enabling better resource utilization and scalability.

## Overview

The disaggregated prefill architecture separates the compute-intensive prefill phase from the memory-intensive decode phase:

- **Prefill servers**: Handle prompt processing and KV cache generation
- **Decode server**: Handles token generation using cached KV states
- **Proxy server**: Coordinates requests between prefill and decode servers

This architecture provides several benefits:
- Better GPU utilization by matching workload characteristics to hardware
- Improved scalability by independently scaling prefill and decode capacity
- Reduced latency through parallel processing
- Cost optimization by using different instance types for different phases

## Available Examples

### 1p1d - Single Prefill, Single Decode
Directory: [`1p1d/`](./1p1d/)

A basic setup with:
- 1 prefill server (port 8100)
- 1 decode server (port 8200)
- 1 proxy server (port 9000)

**Requirements**: At least 2 GPUs

This is the simplest configuration to get started with disaggregated prefill.
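
Once both servers and the proxy are up, a quick liveness check can confirm that everything is listening. A minimal sketch, assuming each vLLM server exposes its standard `/health` route (the proxy's route may differ):

```bash
# Quick liveness check for the prefill (8100), decode (8200), and proxy (9000) servers.
# /health is vLLM's standard liveness route; the proxy's route may differ.
for port in 8100 8200 9000; do
  if curl -sf "http://localhost:${port}/health" > /dev/null; then
    echo "port ${port}: up"
  else
    echo "port ${port}: not responding"
  fi
done
```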

### xp1d - Multiple Prefill, Single Decode
Directory: [`xp1d/`](./xp1d/)

A scaled setup with:
- 2 prefill servers (ports 8100, 8101)
- 1 decode server (port 8200)
- 1 proxy server with round-robin load balancing (port 9000)

**Requirements**: At least 3 GPUs

This configuration demonstrates how to scale prefill capacity while maintaining a single decode instance.
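
The proxy's round-robin strategy alternates which prefill server receives each new request. The following is a conceptual sketch of that dispatch order only, not the proxy's actual code (the real selection logic lives in `disagg_proxy_server.py`):

```bash
# Conceptual illustration of round-robin dispatch across the two prefill ports.
PREFILL_PORTS=(8100 8101)
i=0
for request in req1 req2 req3 req4; do
  # Cycle through the prefill ports: req1 -> 8100, req2 -> 8101, req3 -> 8100, ...
  port=${PREFILL_PORTS[$(( i % ${#PREFILL_PORTS[@]} ))]}
  echo "${request} -> prefill server on port ${port}"
  i=$((i + 1))
done
```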

## Prerequisites

Before running any example, ensure you have:

- [LMCache](https://github.com/LMCache/LMCache) installed: `pip install lmcache`
- [NIXL](https://github.com/ai-dynamo/nixl) installed
- A valid Hugging Face token (`HF_TOKEN`) with access to Llama 3.1 8B Instruct
- Sufficient GPU resources (see individual example requirements)
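
A minimal environment setup might look like the following sketch; the NIXL build steps live in its repository, and the token value is a placeholder:

```bash
# Install LMCache from PyPI.
pip install lmcache

# NIXL is built from source; follow the instructions in its repository:
# https://github.com/ai-dynamo/nixl

# Export your Hugging Face token (placeholder value shown).
export HF_TOKEN=<your_hf_token>

# Confirm that enough GPUs are visible.
nvidia-smi --list-gpus
```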

## Quick Start

1. Choose the appropriate example based on your GPU resources:
   - For 2 GPUs: Use [`1p1d/`](./1p1d/)
   - For 3+ GPUs: Use [`xp1d/`](./xp1d/)

2. Navigate to the chosen directory:
   ```bash
   cd 1p1d/  # or cd xp1d/
   ```

3. Follow the specific README instructions in that directory.
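
4. Once the servers are running, verify the end-to-end path by sending a request through the proxy. A minimal sketch, assuming the proxy forwards vLLM's OpenAI-compatible `/v1/completions` route on port 9000:
   ```bash
   # Send a single completion request through the proxy (hypothetical prompt).
   curl -s http://localhost:9000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
       "model": "meta-llama/Llama-3.1-8B-Instruct",
       "prompt": "Explain disaggregated prefill in one sentence.",
       "max_tokens": 50
     }'
   ```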

## Benchmarking

Both examples can be benchmarked using vLLM's `benchmark_serving.py`:

```bash
python benchmark_serving.py --port 9000 --seed $(date +%s) \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name random --random-input-len 7500 --random-output-len 200 \
    --num-prompts 30 --burstiness 100 --request-rate 1 --ignore-eos
```
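
Note that `benchmark_serving.py` ships with the vLLM source tree rather than the PyPI package, so you may need to fetch it first; the path below reflects the vLLM repository layout at the time of writing:

```bash
# The benchmark script lives under benchmarks/ in the vLLM repository.
git clone https://github.com/vllm-project/vllm.git
cd vllm/benchmarks
python benchmark_serving.py --help
```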

## Architecture Components

Each example includes:

- **Main script**: `disagg_example_*.sh` - Main entry point to run the example
- **Launcher script**: `disagg_vllm_launcher.sh` - Launches vLLM servers and proxy
- **Proxy server**: `disagg_proxy_server.py` - FastAPI server coordinating requests
- **Configuration files**: YAML configs for prefill and decode servers
- **Log files**: Generated during execution for debugging (see the tailing example below)
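
To watch all components at once while an example runs, you can tail the logs together; the file names below come from the single prefill/decode setup (`prefiller.log`, `decoder.log`, `proxy.log`) and may differ for xp1d:

```bash
# Follow the prefill, decode, and proxy logs side by side.
tail -f prefiller.log decoder.log proxy.log
```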

## Troubleshooting

- **GPU Memory Issues**: Ensure you have sufficient VRAM for the model on each GPU
- **Port Conflicts**: Check that ports 8100, 8101, 8200, and 9000 are available (see the check below)
- **HF Token**: Verify your Hugging Face token has access to Llama 3.1 models
- **Dependencies**: Ensure both LMCache and NIXL are properly installed
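
A quick way to confirm those ports are free before launching, using standard Linux tooling:

```bash
# Report any process already listening on the ports the examples use.
for port in 8100 8101 8200 9000; do
  if ss -ltn "( sport = :${port} )" | grep -q LISTEN; then
    echo "port ${port}: in use"
  else
    echo "port ${port}: free"
  fi
done
```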

For detailed troubleshooting, check the log files generated in each example directory.

## Further Reading

- [LMCache Documentation](https://github.com/LMCache/LMCache)
- [NIXL Documentation](https://github.com/ai-dynamo/nixl)
- [vLLM Documentation](https://docs.vllm.ai/)