Enable ViT torch.compile + CUDA Graph #33
base: mlperf-inf-mm-q3vl-v6.0
@@ -438,6 +438,91 @@ class CompilationConfig:
    on selected platforms. Disabled by default until more models
    are supported/tested to work."""

    # Encoder (ViT) CUDA graph settings
    cudagraph_mm_encoder: bool = False
    """Whether to enable CUDA graph capture for multimodal encoders (ViT).
    When enabled, CUDA graphs are captured for the vision encoder to eliminate
    kernel launch overhead. Requires fixed input sizes via bucketing.
    Experimental feature - use with caution."""

    encoder_cudagraph_bucket_sizes: list[int] | None = None
    """Square grid side lengths for padded CUDA graph execution. Each size N
    creates a bucket grid (1, N, N). Inputs with max(H, W) <= N are padded to
    fit the bucket. Example: [32, 64, 94, 128, 188, 256, 312] captures grids
    (1, 32, 32), (1, 64, 64), etc. Used with encoder_cudagraph_padded_mode=True."""

    encoder_cudagraph_grid_configs: list[tuple[int, int, int]] | str | None = None
    """Grid configurations (T, H, W in patch units) for exact-match CUDA graph
    capture. Can be a list of tuples or preset "custom" (top 30 most common grids,
    58.9% exact match coverage). If None, uses "custom" as default."""

    encoder_cudagraph_padded_mode: bool = True
    """Whether to use padded execution for encoder CUDA graphs.
    When True, inputs smaller than a captured bucket are padded to fit.
    Padded: pixel_values, pos_embeds, rotary_embeds (with zeros).
    NOT padded: cu_seqlens, max_seqlen (set to actual values so flash
    attention only processes real tokens). Output is trimmed to actual size.
    When False, only exact grid matches use CUDA graphs."""

    encoder_cudagraph_max_grid_size: int = 256
    """Maximum grid dimension (H or W) for encoder CUDA graph capture.
    Grids with H > max or W > max are skipped to limit GPU memory usage.
    Memory scales roughly with H*W:
    - 128x128: ~0.8 GiB
    - 188x188: ~1.7 GiB
    - 256x256: ~3.2 GiB
    Set lower (e.g., 128, 188, 218) on memory-constrained systems.
    Default 256 captures all grids in CUSTOM_GRID_CONFIGS."""

    encoder_cudagraph_verbose: bool = False
    """Enable verbose logging for encoder CUDA graph execution.
    When True, logs each ViT input size and CUDA graph hit/miss/padded status.
    Useful for debugging and analyzing CUDA graph utilization.
    When False, only logs summary stats at the end of execution."""
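The memory figures in the encoder_cudagraph_max_grid_size docstring scale roughly linearly with H*W, so they can be extrapolated when picking a cap on memory-constrained systems. The snippet below is only a back-of-the-envelope sketch based on the numbers quoted above, not a measurement utility from this PR.

```python
# Rough sketch: extrapolate per-graph CUDA graph memory from the figures quoted
# in the encoder_cudagraph_max_grid_size docstring (128x128 ~ 0.8 GiB).
GIB_PER_PATCH = 0.8 / (128 * 128)  # assumes roughly linear scaling in H*W


def estimate_graph_gib(h: int, w: int) -> float:
    """Very rough per-graph memory estimate in GiB for an (H, W) patch grid."""
    return h * w * GIB_PER_PATCH


for side in (128, 188, 218, 256):
    print(f"{side}x{side}: ~{estimate_graph_gib(side, side):.1f} GiB")
# Roughly reproduces the quoted figures: 0.8, 1.7, (~2.3), 3.2 GiB.
```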
    encoder_cudagraph_one_by_one: bool = True
    """Enable one-by-one image processing for multi-image batches.
    When True (default), multi-image batches are processed individually to
    maximize CUDA graph hit rate.
    When False, multi-image batches are processed together in eager mode,
    which may be faster when CUDA graph overhead (sync, memory) outweighs
    the kernel launch savings.
    Set to False if you observe throughput regression with encoder CUDA graphs."""

[Review comment] Is it possible to launch multiple CUDA graphs for multi-image batches? I know we may need multiple sets of input buffers and need to handle synchronization per stream, but maybe we could find a common batch size that recurs heavily enough to justify this cost?

[Author reply] Launch multiple CUDA graphs for multi-image batches - we already do this; each CUDA graph handles one image in the batch. Find a common batch size that recurs - I'm not sure the benchmark reuses particular batch sizes; I need to check.
    encoder_cudagraph_batch_sizes: list[int] | None = None
    """Batch sizes for grouped batched CUDA graph capture.
    When set (e.g., [4]), captures graphs for processing multiple images
    together. Images are grouped by similar grid sizes and padded to the
    largest grid in each group. Single graph replay for the whole group.
    Example: [4] captures batch_size=4 graphs only (1-3 images use eager).
    Default None uses legacy one-by-one mode (batch_size=1 per image)."""

    encoder_cudagraph_piecewise: bool = False
    """Enable piecewise CUDA graph mode for encoder (ViT).
    When True, torch.compile splits the encoder graph at attention ops, so:
    - Non-attention ops (norm, MLP, patch_embed, merger) are captured in CUDA graphs
    - Attention ops run in eager mode with original batch structure
    This allows batching multiple images together while still benefiting from
    CUDA graphs for the non-attention parts. More efficient than one-by-one
    processing when batch sizes vary.
    Requires compile_mm_encoder=True. Mutually exclusive with cudagraph_mm_encoder."""

    encoder_cudagraph_capture_sizes: list[int] | None = None
    """CUDA graph capture sizes (token counts) for encoder piecewise mode.
    These are the total token counts at which CUDA graphs are captured.
    For Qwen3-VL with spatial_merge_size=2:
    - (1, 32, 32) grid → 1024 patches → 256 output tokens
    - (1, 64, 64) grid → 4096 patches → 1024 output tokens
    - (1, 94, 94) grid → 8836 patches → 2209 output tokens
    Example: [256, 512, 1024, 2048, 4096, 8192, 16384]
    If None, encoder piecewise mode will use compile_ranges only (no cudagraph)."""

    encoder_spatial_merge_size: int = 2
    """Spatial merge size for vision encoder (e.g., 2 for Qwen3-VL).
    This converts encoder_cudagraph_capture_sizes (output tokens) to input patches.
    Input patches = output tokens * spatial_merge_size².
    Default is 2, which is common for Qwen-VL family models."""

    # Inductor capture
    compile_sizes: list[int | str] | None = None
    """Sizes to compile for inductor. In addition
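The grid-to-token arithmetic quoted in the encoder_cudagraph_capture_sizes and encoder_spatial_merge_size docstrings can be checked with a few lines of Python. This is only an illustrative sketch (the helper names are made up, not code from this PR); it reproduces the Qwen3-VL numbers above and shows how a capture size might be picked from the example list.

```python
# Illustrative sketch of the patch/token arithmetic described above.
import bisect


def grid_to_tokens(t: int, h: int, w: int, spatial_merge_size: int = 2) -> tuple[int, int]:
    """Return (input_patches, output_tokens) for a (T, H, W) grid in patch units."""
    patches = t * h * w
    # Input patches = output tokens * spatial_merge_size**2, so invert:
    tokens = patches // (spatial_merge_size**2)
    return patches, tokens


def pick_capture_size(tokens: int, capture_sizes: list[int]) -> int | None:
    """Smallest captured token count that can hold `tokens`.
    Assumes (like the decoder path) that inputs are padded up to the nearest
    captured size; None means no captured size fits (fall back to eager)."""
    sizes = sorted(capture_sizes)
    idx = bisect.bisect_left(sizes, tokens)
    return sizes[idx] if idx < len(sizes) else None


if __name__ == "__main__":
    capture_sizes = [256, 512, 1024, 2048, 4096, 8192, 16384]
    for grid in [(1, 32, 32), (1, 64, 64), (1, 94, 94)]:
        patches, tokens = grid_to_tokens(*grid)
        print(grid, "->", patches, "patches,", tokens, "tokens,",
              "capture size", pick_capture_size(tokens, capture_sizes))
    # (1, 32, 32) -> 1024 patches, 256 tokens, capture size 256
    # (1, 64, 64) -> 4096 patches, 1024 tokens, capture size 1024
    # (1, 94, 94) -> 8836 patches, 2209 tokens, capture size 4096
```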
@@ -622,6 +707,15 @@ class CompilationConfig:
        "vllm::sparse_attn_indexer",
    ]

    # Encoder (ViT) attention ops; used for piecewise cudagraphs on encoders.
    # These ops depend on batch structure (cu_seqlens), so they must be
    # excluded from cudagraph capture to allow batching multiple images.
    _encoder_attention_ops: ClassVar[list[str]] = [
        "vllm::flash_attn_maxseqlen_wrapper",
        "vllm::fa4_flash_attn_maxseqlen_wrapper",
        "vllm::flashinfer_wrapper",
    ]

    def compute_hash(self) -> str:
        """
        Provide a hash that uniquely identifies all the configs
@@ -1023,6 +1117,15 @@ def splitting_ops_contain_attention(self) -> bool:
            op in self.splitting_ops for op in self._attention_ops
        )

    def get_encoder_splitting_ops(self) -> list[str]:
        """Get splitting ops for encoder (ViT) compilation.

        For piecewise cudagraph on encoders, we split at attention ops
        so that non-attention ops (norm, MLP) can be captured in cudagraphs
        while attention runs in eager mode with batched images.
        """
        return list(self._encoder_attention_ops)

    def is_attention_compiled_piecewise(self) -> bool:
        if not self.splitting_ops_contain_attention():
            return False
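As a usage illustration for the fields added above (a hedged sketch, not part of this diff; the import path and the way CompilationConfig is constructed here are assumptions), the two encoder CUDA graph modes would be configured roughly as follows. The docstrings above state that encoder_cudagraph_piecewise requires compile_mm_encoder=True and is mutually exclusive with cudagraph_mm_encoder.

```python
# Hypothetical usage sketch; the field names come from this diff, but the import
# path and construction style are assumptions, not taken from the PR.
from vllm.config import CompilationConfig  # import path assumed

# Whole-encoder CUDA graphs with padded buckets (one graph replay per image):
bucketed = CompilationConfig(
    cudagraph_mm_encoder=True,
    encoder_cudagraph_padded_mode=True,
    encoder_cudagraph_bucket_sizes=[32, 64, 94, 128, 188, 256],
    encoder_cudagraph_max_grid_size=256,
)

# Piecewise mode: attention stays eager, everything else is captured:
piecewise = CompilationConfig(
    compile_mm_encoder=True,               # required by encoder_cudagraph_piecewise
    encoder_cudagraph_piecewise=True,      # mutually exclusive with cudagraph_mm_encoder
    encoder_cudagraph_capture_sizes=[256, 512, 1024, 2048, 4096, 8192, 16384],
    encoder_spatial_merge_size=2,
)

# The ops the encoder graph is split at (from the hunks above):
print(piecewise.get_encoder_splitting_ops())
# ['vllm::flash_attn_maxseqlen_wrapper', 'vllm::fa4_flash_attn_maxseqlen_wrapper',
#  'vllm::flashinfer_wrapper']
```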
[Review comment on encoder_cudagraph_padded_mode] Is there a scenario where we don't need to pad? If not, I suggest removing this flag.

[Author reply] This flag allows us to catch more grid sizes (i.e., token buckets) where exact match does not work. So far, enabling padding in addition to exact match has given better performance than exact match only.

[Review comment] Yeah, so if that's the case I guess we can just use padding by default.
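To make the padded execution discussed in this thread concrete: per the encoder_cudagraph_bucket_sizes and encoder_cudagraph_padded_mode docstrings, an input is padded up to the smallest bucket whose side is at least max(H, W), cu_seqlens/max_seqlen stay at the actual values, and the output is trimmed back to the real token count. The sketch below is assumed illustrative logic, not the PR's implementation.

```python
# Assumed illustration of padded-bucket selection; not code from this PR.
import torch


def pick_bucket(h: int, w: int, bucket_sizes: list[int]) -> int | None:
    """Smallest bucket N with max(H, W) <= N; None means no bucket fits (run eager)."""
    candidates = [n for n in sorted(bucket_sizes) if max(h, w) <= n]
    return candidates[0] if candidates else None


def pad_for_bucket(pixel_values: torch.Tensor, h: int, w: int, bucket: int) -> torch.Tensor:
    """Zero-pad (H*W, C) patch inputs up to the (bucket, bucket) grid's patch count.
    In the real path cu_seqlens / max_seqlen are NOT padded, so attention only sees
    the first H*W rows, and the graph output is trimmed back to the actual token count."""
    padded = torch.zeros(bucket * bucket, pixel_values.shape[-1],
                         dtype=pixel_values.dtype, device=pixel_values.device)
    padded[: h * w] = pixel_values
    return padded


# Example: a (1, 90, 80) grid falls into the 94 bucket from the docstring example list.
print(pick_bucket(90, 80, [32, 64, 94, 128, 188, 256, 312]))  # -> 94
```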