
Commit 557aa1f

Fix vLLM break: Support LoRA with speculative decoding (#21068)
Signed-off-by: leo-pony <[email protected]>
1 parent e626367 commit 557aa1f
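
Context: the upstream vLLM change referenced as #21068 ("Support LoRA with speculative decoding") added a second argument, num_sampled_tokens, to InputBatch.make_lora_inputs. With speculative decoding a request can have more than one token sampled per step, so the sampler-side LoRA mapping can no longer assume one sampled token per request; vllm-ascend's npu_input_batch.py has to match the new signature or the worker call breaks. A minimal, self-contained sketch of the idea (array names and values here are illustrative, not taken from the diff):

import numpy as np

request_lora_mapping = np.array([1, 0, 2])  # LoRA id per request (0 = no LoRA)
num_scheduled_tokens = np.array([4, 2, 3])  # input tokens per request this step
num_sampled_tokens = np.array([3, 1, 2])    # sampled tokens per request; spec
                                            # decode can accept >1 per step

# Model-side mapping: one LoRA id per scheduled input token.
token_lora_mapping = request_lora_mapping.repeat(num_scheduled_tokens)
# Sampler-side mapping: one LoRA id per *sampled* token. Without
# num_sampled_tokens this collapses to one entry per request, which
# misaligns with the sampler's rows once spec decode accepts extra tokens.
sampler_lora_mapping = request_lora_mapping.repeat(num_sampled_tokens)

print(token_lora_mapping)    # [1 1 1 1 0 0 2 2 2]
print(sampler_lora_mapping)  # [1 1 1 0 2 2]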

File tree: 2 files changed (+2 −2 lines)


.github/workflows/_e2e_test.yaml

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ jobs:
           #s pytest -sv tests/e2e/multicard/test_external_launcher.py
           #s pytest -sv tests/e2e/multicard/test_single_request_aclgraph.py
           #s pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
-          #s pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
+          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

           # To avoid oom, we need to run the test in a single process.
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
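
The workflow change just re-enables tests/e2e/multicard/test_ilama_lora_tp2.py, which was evidently commented out (the "#s" prefix) while the upstream signature change kept multicard LoRA broken; with the fix below it can run in CI again.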

vllm_ascend/worker/npu_input_batch.py

Lines changed: 1 addition & 1 deletion
@@ -834,7 +834,7 @@ def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
             non_blocking=True)

     def make_lora_inputs(
-        self, num_scheduled_tokens: np.ndarray
+        self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
     ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
         """
         Given the num_scheduled_tokens for each request in the batch, return
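
Only the signature change is visible in this hunk. For orientation, here is a sketch of how the full method might consume both arrays, modeled on the pattern of vLLM's GPU InputBatch; the actual body in npu_input_batch.py is not shown in the diff, so treat everything past the signature as an assumption:

# Sketch only: the body below is an assumption modeled on vLLM's GPU
# InputBatch, not the confirmed vllm-ascend implementation. Assumes
# `import numpy as np` and `from vllm.lora.request import LoRARequest`
# at module level, plus the usual input-batch attributes.
def make_lora_inputs(
    self, num_scheduled_tokens: np.ndarray, num_sampled_tokens: np.ndarray
) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]:
    # LoRA id for each active request in the batch.
    req_lora_mapping = self.request_lora_mapping[:self.num_reqs]
    # Sampler-side mapping: one id per sampled token. Speculative decoding
    # can sample several tokens per request, which is why num_sampled_tokens
    # had to be threaded through.
    prompt_lora_mapping = tuple(req_lora_mapping.repeat(num_sampled_tokens))
    # Model-side mapping: one id per scheduled input token.
    token_lora_mapping = tuple(req_lora_mapping.repeat(num_scheduled_tokens))
    active_lora_requests: set[LoRARequest] = set(
        self.lora_id_to_lora_request.values())
    return prompt_lora_mapping, token_lora_mapping, active_lora_requests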
