
Commit 8d856cf

[Profiling] Pull Over the TPU Profiler from vLLM + add profiling docs (#882)
Signed-off-by: Jacob Platin <[email protected]>
1 parent 42f7aac commit 8d856cf

File tree: 3 files changed, +220 -1 lines changed


docs/profiling.md

Lines changed: 105 additions & 0 deletions
# Profiling

There are currently three ways to profile your workload:

## Using `examples/tpu_profiling.py`

### vLLM TPU Profiling Script

This script is a utility for profiling the performance of the vLLM engine on TPU VMs. It uses the JAX profiler to capture detailed performance traces.

The profiling results can be visualized using tools like TensorBoard (with the `tensorboard-plugin-profile` package) or Perfetto UI.

### How to Use

#### Prerequisites
You must install the TensorBoard profile plugin to visualize the results:

```bash
pip install tensorboard-plugin-profile
```

#### Basic Command
The script is run from the command line, specifying the workload parameters and any necessary vLLM engine arguments.

```bash
python3 examples/tpu_profiling.py --model <your-model-name> [OPTIONS]
```

#### Key Arguments
* `--model`: (Required) The name or path of the model to profile.
* `--input-len`: The length of the input prompt, in tokens, per request.
* `--output-len`: The number of tokens to generate per request.
* `--batch-size`: The number of requests.
* `--profile-result-dir`: The directory where the JAX profiler output will be saved.
* The script also accepts all standard vLLM `EngineArgs` (e.g., `--tensor-parallel-size`, `--dtype`); see the third example below.

#### Examples

**1. Profile a Prefill Operation:**
To profile a single request with a long input prompt (e.g., 1024 tokens), set `--input-len` high and `--batch-size` to 1.

```bash
python3 examples/tpu_profiling.py \
    --model google/gemma-2b \
    --input-len 1024 \
    --output-len 1 \
    --batch-size 1
```

**2. Profile a Decoding Operation:**
To profile a large batch of single-token decoding steps, set `--input-len` and `--output-len` to 1 and use a large `--batch-size`.

```bash
python3 examples/tpu_profiling.py \
    --model google/gemma-2b \
    --input-len 1 \
    --output-len 1 \
    --batch-size 256
```
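**3. Profile with Engine Arguments (illustrative):**
Because the script also accepts standard vLLM `EngineArgs`, workload flags can be combined with engine flags. The values below are placeholders; adapt the model, parallelism degree, and output directory to your setup.

```bash
python3 examples/tpu_profiling.py \
    --model google/gemma-2b \
    --input-len 1024 \
    --output-len 128 \
    --batch-size 8 \
    --tensor-parallel-size 4 \
    --profile-result-dir profiles/gemma-2b-tp4
```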
## Using `PHASED_PROFILING_DIR`
If you set the following environment variable:

```
PHASED_PROFILING_DIR=<DESIRED PROFILING OUTPUT DIR>
```

we will automatically capture profiles during three phases of your workload (assuming they are encountered):
1. Prefill-heavy (the ratio of prefill tokens to total scheduled tokens for the given batch is >= 0.9)
2. Decode-heavy (the ratio of prefill tokens to total scheduled tokens for the given batch is <= 0.2)
3. Mixed (the ratio of prefill tokens to total scheduled tokens for the given batch is between 0.4 and 0.6)

To aid in your analysis, we will also log the batch composition for the profiled batches.
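For example, a hypothetical serving run that captures phased profiles (the output directory and model name are placeholders):

```bash
# Profiles for the prefill-heavy, decode-heavy, and mixed phases are written under this directory.
PHASED_PROFILING_DIR=/tmp/phased_profiles vllm serve <your-model-name>
```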
## Using `USE_JAX_PROFILER_SERVER`
If you set the following environment variable:

```
USE_JAX_PROFILER_SERVER=True
```

you can instead manually decide when to capture a profile and for how long. This can be helpful if your workload (e.g. E2E benchmarking) is large and profiling the entire run (i.e. using the method above) would generate a massive tracing file.

You can additionally set the desired profiling port (default is `9999`):

```
JAX_PROFILER_SERVER_PORT=XXXX
```

To use this approach, do the following (an illustrative sketch of these commands follows the list):

1. Run your typical `vllm serve` or `offline_inference` command (making sure to set `USE_JAX_PROFILER_SERVER=True`)
2. Run your benchmarking command (`python benchmark_serving.py...`)
3. Once the warmup has completed and your benchmark is running, start a new TensorBoard instance with your `logdir` set to the desired output location of your profiles (e.g. `tensorboard --logdir=profiles/llama3-mmlu/`)
4. Open the TensorBoard instance and navigate to the `profile` page (e.g. `http://localhost:6006/#profile`)
5. Click `Capture Profile` and, in the `Profile Service URL(s) or TPU name` box, enter `localhost:XXXX`, where `XXXX` is your `JAX_PROFILER_SERVER_PORT` (default is `9999`)
6. Enter the desired amount of time (in ms)
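For illustration, a sketch of this workflow as shell commands; the model name and profile directory are placeholders, and your benchmark arguments will differ:

```bash
# Terminal 1: serve the model with the JAX profiler server enabled (port 9999 by default).
USE_JAX_PROFILER_SERVER=True JAX_PROFILER_SERVER_PORT=9999 vllm serve <your-model-name>

# Terminal 2: run your usual benchmarking command against the server.
python benchmark_serving.py ...

# Terminal 3: once warmup has finished, point TensorBoard at your profile output directory,
# open http://localhost:6006/#profile, click "Capture Profile", enter localhost:9999 as the
# profile service URL, and set the desired capture duration in ms.
tensorboard --logdir=profiles/llama3-mmlu/
```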

examples/tpu_profiling.py

Lines changed: 114 additions & 0 deletions
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Implements profiling for vLLM on TPU VMs using the JAX profiler.
# NOTE: you will need the tensorboard-plugin-profile python package to
# visualize the results in TensorBoard.
# Please see docs/profiling.md for more details.
# Usage example for prefilling 1 request of 1024 tokens:
# python3 examples/tpu_profiling.py --input-len 1024 --output-len 1 --batch-size 1
# Usage example for decoding 256 requests of 1 token each:
# python3 examples/tpu_profiling.py --input-len 1 --output-len 1 --batch-size=256

import argparse
import dataclasses
import os
import time

import numpy as np
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.utils import FlexibleArgumentParser

DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))


def main(args: argparse.Namespace):
    print(args)

    # Profile
    profile_dir = args.profile_result_dir
    print(f"Profiling (results will be saved to '{profile_dir}')...")
    os.environ["VLLM_TORCH_PROFILER_DIR"] = profile_dir

    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))

    sampling_params = SamplingParams(
        temperature=0.0,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
    dummy_prompts: list[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

    def run_to_completion():
        start_time = time.perf_counter()
        llm.generate(dummy_prompts,
                     sampling_params=sampling_params,
                     use_tqdm=False)
        end_time = time.perf_counter()
        latency = end_time - start_time
        return latency

    # Warmup
    print("Warming up...")
    warmup_latencies = []
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        warmup_latencies.append(run_to_completion())
    print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s")

    # Enable tracing on server
    llm.start_profile()
    if DELAY_MS == 0:
        time.sleep(1.0)
    profile_latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profile iterations"):
        profile_latencies.append(run_to_completion())
    llm.stop_profile()
    print(f"Average profile latency: {np.mean(profile_latencies):.4f}s")

    return


def parse_args():
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion.")
    parser.add_argument("--input-len", type=int, default=32)
    parser.add_argument("--output-len", type=int, default=128)
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument(
        "--num-iters-warmup",
        type=int,
        default=5,
        help="Number of iterations to run for warmup.",
    )
    parser.add_argument(
        "--num-iters",
        type=int,
        default=1,
        help="Number of iterations to run for profiling.",
    )
    parser.add_argument(
        "--profile-result-dir",
        type=str,
        default="profiles",
        help=("path to save the JAX profiler output. Can be visualized "
              "with ui.perfetto.dev, Tensorboard, or XProf"),
    )

    parser = EngineArgs.add_cli_args(parser)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(args)

tpu_inference/runner/utils.py

Lines changed: 1 addition & 1 deletion
@@ -414,7 +414,7 @@ def step(self, batch_composition_stats: dict) -> None:
         have_seen_all_phases = all(self.inference_phase_seen.values())
         # We want to start profiling only after the first trial request
         is_past_initial_request = batch_composition_stats[
-            "num_reqs"] >= 1 and batch_composition_stats[
+            "num_reqs"] > 1 and batch_composition_stats[
                 "total_num_scheduled_tokens"] > 1
         if is_past_initial_request and (not have_seen_all_phases
                                         or self.current_phase != ""):
