Commit b04e5d3

lint
Signed-off-by: Bill Nell <[email protected]>
1 parent 0f711d8 commit b04e5d3

11 files changed: +89 −79 lines

requirements/test.txt

+19-2
@@ -27,6 +27,10 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -126,6 +130,11 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.2.2
+    # via
+    #   anyio
+    #   hypothesis
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -623,7 +632,6 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
-    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -683,8 +691,13 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
+toml==0.10.2
+    # via datamodel-code-generator
 tomli==2.2.1
-    # via schemathesis
+    # via
+    #   black
+    #   pytest
+    #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -756,12 +769,16 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
 tzdata==2024.2

tests/kernels/moe/test_pplx_moe.py

-3
@@ -522,13 +522,10 @@ def pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
 def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
     assert torch.cuda.current_device() == pgi.local_rank

-    hidden_dim = a.shape[1]
     num_experts = w1.shape[0]
-    block_size = 128
     device = pgi.device
     rank = pgi.rank
     world_size = pgi.world_size
-    topk = topk_ids.shape[1]
     max_num_tokens = rank_chunk(a.shape[0], 0, world_size)

     dispatch_combine = BatchedDispatchCombine(

vllm/compilation/compiler_interface.py

+2-1
@@ -328,7 +328,8 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
         assert hash_str is not None, (
             f"failed to get the hash of the compiled graph: {file_path}")
         assert file_path is not None, (
-            "failed to get the file path of the compiled graph: {file_path}")
+            "failed to get the file path of the compiled graph: {file_path}"
+        )
         return compiled_graph, (hash_str, file_path)

     def load(self,

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

+18-39
@@ -514,31 +514,18 @@ def dispatch(
             dtype=torch.int,
             device=a1.device)

-        rem_experts = num_experts % self.world_size
-        num_local_experts = ((num_experts // self.world_size) +
-                             (1 if self.rank < rem_experts else 0))
+        assert num_experts % self.world_size == 0
+
+        num_local_experts = num_experts // self.world_size

         b_a1 = torch.zeros(
             (num_local_experts, self.max_num_tokens, hidden_dim),
             dtype=a1.dtype,
             device=a1.device)

-        first_expert = (((num_experts // self.world_size) * self.rank) +
-                        rem_experts - self.rank)
+        first_expert = num_local_experts * self.rank
         last_expert = first_expert + num_local_experts

-        # rhs = torch.empty((self.max_num_tokens, hidden_dim),
-        #                   dtype=a1.dtype, device=a1.device)
-
-        # for expert_id in range(first_expert, last_expert):
-        #     topks = torch.any(topk_ids == expert_id, dim=1).flatten()
-        #     rows = torch.count_nonzero(topks.flatten())
-        #     #rhs[:rows] = a1[:topks.numel()][topks]
-        #     topks_idx = topks.nonzero()
-        #     torch.index_select(a1, dim=0, index=topks_idx.flatten(), out=rhs[:rows])
-        #     b_a1[expert_id - first_expert, :rows, :] = rhs[:rows]
-        #     tokens_per_expert[expert_id - first_expert] = rows
-
         for expert_id in range(first_expert, last_expert):
             topks = torch.any(topk_ids == expert_id, dim=1).flatten()
             rows = torch.count_nonzero(topks.flatten())
@@ -558,24 +545,14 @@ def combine(
     ) -> None:
         num_tokens = topk_ids.shape[0]
         num_local_experts = fused_expert_output.shape[0]
-        topk = topk_weights.shape[1]
         K = fused_expert_output.shape[-1]
         assert output.shape[0] == num_tokens and output.shape[1] == K

         output.fill_(0)

-        first_expert = num_local_experts * self.rank  # NOT QUITE RIGHT
+        first_expert = num_local_experts * self.rank
         last_expert = first_expert + num_local_experts

-        # for expert_id in range(first_expert, last_expert):
-        #     topkws = topk_ids == expert_id
-        #     topks = torch.any(topkws, dim=1).flatten()
-        #     outrhs = output[topks]
-        #     rhs = fused_expert_output[expert_id - first_expert, :outrhs.shape[0], :]
-        #     if not apply_router_weight_on_input:
-        #         rhs.mul_(topk_weights[topkws].view(rhs.shape[0], 1))
-        #     output[topks] = outrhs + rhs
-
         for expert_id in range(first_expert, last_expert):
             topkws = topk_ids == expert_id
             topks = torch.any(topkws, dim=1).flatten()
@@ -661,20 +638,20 @@ def apply(
         num_experts = global_num_experts
         out = _resize_cache(workspace13,
                             (num_experts, max_num_tokens * num_dp, hidden_dim))
-        num_local_experts = w1.shape[0]  #expert_num_tokens.numel()
+        num_local_experts = w1.shape[0]
         assert num_local_experts == w1.shape[
             0], f"{num_local_experts} == {w1.shape[0]}"

         N = w1.shape[1] // 2

         # Not cudagraph friendly
-        # assert (torch.cuda.is_current_stream_capturing() or
-        #         torch.all(expert_num_tokens <= max_num_tokens)), (
-        #     f"{expert_num_tokens} <= {max_num_tokens}")
+        assert (torch.cuda.is_current_stream_capturing()
+                or torch.all(expert_num_tokens <= max_num_tokens)), (
+                    f"{expert_num_tokens} <= {max_num_tokens}")

         for expert in range(num_local_experts):
             # Indexing expert_num_tokens doesn't work w/cudagraphs
-            if True or torch.cuda.is_current_stream_capturing():
+            if torch.cuda.is_current_stream_capturing():
                 num = max_num_tokens * num_dp
             else:
                 num = int(expert_num_tokens[expert].item())
@@ -821,12 +798,14 @@ def apply(
             block_shape=self.block_shape)

         # Fix activations
-        # assert activation == "silu"
-        # invoke_batched_silu_and_mul(output=intermediate_cache2,
-        #                             input=intermediate_cache1,
-        #                             expert_num_tokens=expert_num_tokens)
-        self.activation(activation, intermediate_cache2.view(-1, N // 2),
-                        intermediate_cache1.view(-1, N))
+        if True:
+            assert activation == "silu"
+            invoke_batched_silu_and_mul(output=intermediate_cache2,
+                                        input=intermediate_cache1,
+                                        expert_num_tokens=expert_num_tokens)
+        else:
+            self.activation(activation, intermediate_cache2.view(-1, N // 2),
+                            intermediate_cache1.view(-1, N))

         #qintermediate_cache2 = intermediate_cache2
         a2q_scale = a2_scale
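
Note: the dispatch/combine hunks above replace the uneven-split bookkeeping with a hard assumption that experts divide evenly across ranks, so each rank owns a contiguous block of experts and gathers its tokens with a boolean mask. A minimal, self-contained sketch of that partitioning and gather follows; the shapes, rank and world size are made-up example values, not the class's actual state.

# Illustrative sketch (not vLLM code): even expert partitioning plus a
# mask-based per-expert token gather, mirroring the simplified dispatch above.
import torch

world_size, rank = 2, 1
num_experts, max_num_tokens, hidden_dim, topk = 8, 16, 32, 2

assert num_experts % world_size == 0
num_local_experts = num_experts // world_size
first_expert = num_local_experts * rank
last_expert = first_expert + num_local_experts

a1 = torch.randn(max_num_tokens, hidden_dim)
topk_ids = torch.randint(0, num_experts, (max_num_tokens, topk))

b_a1 = torch.zeros(num_local_experts, max_num_tokens, hidden_dim)
tokens_per_expert = torch.zeros(num_local_experts, dtype=torch.int)

for expert_id in range(first_expert, last_expert):
    # Tokens that routed any of their top-k slots to this expert.
    topks = torch.any(topk_ids == expert_id, dim=1)
    rows = int(torch.count_nonzero(topks))
    b_a1[expert_id - first_expert, :rows, :] = a1[topks]
    tokens_per_expert[expert_id - first_expert] = rows

combine then reverses this gather: for each local expert it scatters the expert's output rows back to the owning tokens, scaling by the matching topk weight unless the router weight was already applied on the input.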

vllm/model_executor/layers/fused_moe/layer.py

+43-27
@@ -68,55 +68,68 @@ def use_pplx_kernels(self):
     def make(tp_size_: int, dp_size_: int,
              vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
         """
-        Determine MoE parallel configuration. Based on the input tp_size_, dp_size_,
-        ep_size_ and vllm's parallel config, determine what level's of parallelism
-        to use in the fused moe layer.
+        Determine MoE parallel configuration. Based on the input tp_size_,
+        dp_size_, ep_size_ and vllm's parallel config, determine what
+        level's of parallelism to use in the fused moe layer.

         Args:
             tp_size_ (int): tp_size passed into the FusedMoE constructor.
             dp_size_ (int): dp_size passed into the FusedMoE constructor.
             ep_size_ (int): ep_size passed into the FusedMoE constructor.
-            vllm_parallel_config (ParallelConfig): vllm's parallel config object.
+            vllm_parallel_config (ParallelConfig): vllm's parallel config
+                object.

         Examples:
             When there is no parallelism requested, i.e. tp_size_ = dp_size_ = 1,
             we simply return the sizes unaltered and the ranks set to 0.

-            Expert Parallelism is considered only when either dp_size_ or tp_size_ is non trivial.
+            Expert Parallelism is considered only when either dp_size_ or tp_size_
+            is non trivial.

-            When TP = 2, DP = 1 and EP = False, the configuration on different devices,
-                - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // legend : {size, rank}
+            When TP = 2, DP = 1 and EP = False, the configuration on different
+            devices,
+                - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
+                    legend : {size, rank}
                 - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
                 - Comment : Tensors are sharded across 2 devices.

-            When TP = 1, DP = 2 and EP = False, the configuration on different devices,
+            When TP = 1, DP = 2 and EP = False, the configuration on different
+            devices,
                 - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
                 - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
-                - Comment: There are 2 engine instances and the tensors are sharded across 2 decvices.
+                - Comment: There are 2 engine instances and the tensors are sharded
+                    across 2 decvices.

-            When TP = 2, DP = 2 and EP = False, the configuration on different devices,
+            When TP = 2, DP = 2 and EP = False, the configuration on different
+            devices,
                 - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
                 - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
                 - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
                 - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
-                - Comment: There are 2 engine instances and the tensors are sharded across 4 devices.
+                - Comment: There are 2 engine instances and the tensors are sharded
+                    across 4 devices.

-            When, TP = 2, DP = 1 and EP = True, the configuration on different devices,
+            When, TP = 2, DP = 1 and EP = True, the configuration on different
+            devices,
                 - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
                 - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
                 - Comment: The experts are split between the 2 devices.

-            When, TP = 1, DP = 2 and EP = True, the configuration on different devices,
+            When, TP = 1, DP = 2 and EP = True, the configuration on different
+            devices,
                 - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
                 - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
-                - Comment: There are 2 engine instances and the experts are split between the 2 devices.
+                - Comment: There are 2 engine instances and the experts are split
+                    between the 2 devices.

-            When TP = 2, DP = 2 and EP = True, the configuration on different devices,
+            When TP = 2, DP = 2 and EP = True, the configuration on different
+            devices,
                 - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
                 - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
                 - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
                 - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
-                - Comment: There are 2 engine instances and the experts are split between the 4 devices.
+                - Comment: There are 2 engine instances and the experts are split
+                    between the 4 devices.
         """

         def flatten_tp_across_dp(dp_rank: int):
@@ -127,7 +140,8 @@ def flatten_tp_across_dp(dp_rank: int):
             tp_rank = dp_rank * tp_size_ + tp_rank
             return tp_size, tp_rank

-        use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel
+        use_ep = (dp_size_ * tp_size_ > 1
+                  and vllm_parallel_config.enable_expert_parallel)

         dp_size = dp_size_
         dp_rank = get_dp_group().rank_in_group
@@ -143,8 +157,8 @@ def flatten_tp_across_dp(dp_rank: int):
                                          use_ep=False)
        # DP + EP / TP + EP / DP + TP + EP
        assert use_ep
-        # In EP, each device owns a set of experts fully. There is no tensor parallel.
-        # Update tp_size, tp_rank, ep_size and ep_rank to reflect that.
+        # In EP, each device owns a set of experts fully. There is no tensor
+        # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that.
         ep_size = tp_size
         ep_rank = tp_rank
         return FusedMoEParallelConfig(tp_size=1,
@@ -719,12 +733,13 @@ def __init__(
         self.params_dtype = params_dtype

         vllm_config = get_current_vllm_config()
-        self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
-            tp_size_=(tp_size if tp_size is not None else
-                      get_tensor_model_parallel_world_size()),
-            dp_size_=(dp_size
-                      if dp_size is not None else get_dp_group().world_size),
-            vllm_parallel_config=vllm_config.parallel_config)
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))

         self.global_num_experts = num_experts

@@ -1184,8 +1199,9 @@ def must_reduce_shared_outputs(self) -> bool:
     def maybe_all_reduce_tensor_model_parallel(
             self, final_hidden_states: torch.Tensor):
         """
-        The pplx combine kernel reduce across GPU ranks by default. The pplx kernels are
-        used when EP is enabled. In that case, this function is a no-op.
+        The pplx combine kernel reduce across GPU ranks by default. The pplx
+        kernels are used when EP is enabled. In that case, this function is a
+        no-op.
         """
         if self.dp_size > 1 and self.use_ep and has_pplx:
             return final_hidden_states
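
Note: the reflowed docstring above encodes a specific size/rank mapping: TP ranks are flattened across DP groups, and when expert parallelism is on, the flattened TP group is reinterpreted as the EP group with TP collapsed to 1. A hedged, standalone sketch of that arithmetic follows; it is illustrative only, not the vLLM implementation, and simply reproduces the tables in the docstring.

# Standalone illustration of the size/rank mapping described in the
# docstring above (assumed simplification; not the vLLM implementation).
def moe_parallel_sizes(tp_size: int, dp_size: int, tp_rank: int,
                       dp_rank: int, enable_expert_parallel: bool):
    # Flatten TP across DP: with DP = 2 and TP = 2 there are 4 "flat" TP ranks.
    flat_tp_size = dp_size * tp_size
    flat_tp_rank = dp_rank * tp_size + tp_rank

    use_ep = flat_tp_size > 1 and enable_expert_parallel
    if not use_ep:
        # TP and/or DP only: EP stays trivial.
        return dict(tp=(flat_tp_size, flat_tp_rank),
                    dp=(dp_size, dp_rank), ep=(1, 0))
    # EP: each device owns whole experts, so TP collapses to 1 and the
    # flattened TP group becomes the EP group.
    return dict(tp=(1, 0), dp=(dp_size, dp_rank),
                ep=(flat_tp_size, flat_tp_rank))

# Matches the "TP = 2, DP = 2, EP = True" example: device 3 gets EP {4, 3}.
assert moe_parallel_sizes(2, 2, 1, 1, True)["ep"] == (4, 3)

With TP = 2 and DP = 2 the four devices get flat ranks 0..3; with EP off those are the TP ranks of a 4-way shard, and with EP on they become EP ranks while TP drops to {1, 0} on every device, matching the docstring's last table.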

vllm/model_executor/models/deepseek_v2.py

+3-2
@@ -145,7 +145,8 @@ def __init__(
                 # to reduce the shared_output result. Instead we reduce
                 # at the end of the forward pass.
                 # With EP and the pplx kernels - this is no longer viable
-                # as all GPU ranks in DP, produce the complete set of hidden_states.
+                # as all GPU ranks in DP, produce the complete set of
+                # hidden_states.
                 # Therefore reduce the shared experts early.
                 reduce_results=self.experts.must_reduce_shared_outputs(),
                 prefix=f"{prefix}.shared_experts",
@@ -178,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                 * (1. / self.routed_scaling_factor)

         if self.tp_size > 1:
-            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501
                 final_hidden_states)

         return final_hidden_states.view(num_tokens, hidden_dim)

vllm/model_executor/models/granitemoe.py

+1-1
@@ -100,7 +100,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         final_hidden_states = self.experts(hidden_states, router_logits)

         if self.tp_size > 1:
-            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501
                 final_hidden_states)

         return final_hidden_states.view(orig_shape)

vllm/model_executor/models/qwen2_moe.py

+1-1
@@ -154,7 +154,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.tp_size > 1:
-            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501
                 final_hidden_states)

         return final_hidden_states.view(orig_shape)

vllm/model_executor/models/qwen3_moe.py

+1-1
@@ -135,7 +135,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                                            router_logits=router_logits)
         final_hidden_states = final_hidden_states
         if self.tp_size > 1:
-            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501
                 final_hidden_states)

         return final_hidden_states.view(orig_shape)

vllm/platforms/cuda.py

+1-1
@@ -157,7 +157,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             logger.info(
                 "Forcing kv cache block size to 64 for FlashMLA backend.")

-        if (False and parallel_config.data_parallel_size > 1
+        if (parallel_config.data_parallel_size > 1
                 and compilation_config.use_cudagraph):
             logger.info(
                 "Data Parallel: Forcing enforce eager to be True since DP is "

vllm/v1/worker/gpu_model_runner.py

-1
@@ -1542,7 +1542,6 @@ def _dummy_run(
                 self.drafter.dummy_run(num_tokens)

         logit_indices = np.cumsum(num_scheduled_tokens) - 1
-        #logit_indices = torch.from_numpy(logit_indices).to(hidden_states.device)
         return hidden_states[logit_indices]

     @torch.inference_mode()
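
Note: the surviving line in the hunk above computes, per request, the index of its last scheduled token, so the dummy run keeps only the hidden states that would feed the logits. A tiny worked example with made-up token counts:

# Worked example (made-up counts): three requests scheduling 3, 1 and 4
# tokens pack into 8 rows of hidden_states; cumsum - 1 yields the index
# of each request's final token, which is where logits are computed.
import numpy as np

num_scheduled_tokens = np.array([3, 1, 4])
logit_indices = np.cumsum(num_scheduled_tokens) - 1
assert logit_indices.tolist() == [2, 3, 7]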
