From ca1b731170612a95cd36c205417b1fdee198d16f Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 23 Oct 2025 16:21:35 +0000
Subject: [PATCH 1/4] Try to manually release the TPU resources.

Signed-off-by: Xiongfei Wei
---
 .buildkite/pipeline_jax.yml |  2 +-
 docker/Dockerfile           |  2 +-
 tests/lora/test_lora.py     | 26 +++++++++++++++-----------
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index dc18454d7..30a474dcd 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -156,7 +156,7 @@ steps:
     commands:
     - |
       .buildkite/scripts/run_in_docker.sh \
-        bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py -k multi_lora'
+        bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'

 # -----------------------------------------------------------------

diff --git a/docker/Dockerfile b/docker/Dockerfile
index f82997de6..a46f075c9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -10,7 +10,7 @@ RUN pip uninstall -y torch torch_xla torchvision

 # Install some basic utilities
 RUN apt-get update && apt-get install -y \
-    git \
+    git lsof \
     libopenblas-base libopenmpi-dev libomp-dev

 # Build vLLM

diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py
index 20df2e68a..82fe59166 100644
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -1,5 +1,6 @@
 # https://github.com/vllm-project/vllm/blob/ed10f3cea199a7a1f3532fbe367f5c5479a6cae9/tests/tpu/lora/test_lora.py
 import os
+import subprocess

 import pytest
 import vllm
@@ -16,17 +17,6 @@
 # 100 training iterations with a training batch size of 100.


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
-    for all tests in this file
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        yield
-
-
 def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
     return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                     max_model_len=256,
@@ -38,6 +28,20 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
                     max_lora_rank=8)


+@pytest.fixture(autouse=True)
+def run_after_each_test():
+    # --- Setup code (runs before each test) ---
+    # print("\nSetting up...")
+    yield  # This is where the test runs
+    # --- Teardown code (runs after each test) ---
+    command = "lsof -t /dev/vfio/* | xargs kill"
+    results = subprocess.run(command,
+                             shell=True,
+                             capture_output=True,
+                             text=True)
+    print(f"Killing TPU resources: {results.stdout}, {results.stderr}")
+
+
 # For multi-chip test, we only use TP=2 because the base model Qwen/Qwen2.5-3B-Instruct has 2 kv heads and the current attention kernel requires it to be divisible by tp_size.
 TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]

From 5db832205fc446d4ceb3371ada641acecff30861 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 23 Oct 2025 17:50:32 +0000
Subject: [PATCH 2/4] Also release the TPU at setup.

Signed-off-by: Xiongfei Wei
---
 tests/lora/test_lora.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py
index 82fe59166..fba079b58 100644
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -32,6 +32,14 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
 def run_after_each_test():
     # --- Setup code (runs before each test) ---
     # print("\nSetting up...")
+    command = "lsof -t /dev/vfio/* | xargs kill"
+    results = subprocess.run(command,
+                             shell=True,
+                             capture_output=True,
+                             text=True)
+    print(
+        f"Setting up: Killing TPU resources: {results.stdout}, {results.stderr}"
+    )
     yield  # This is where the test runs
     # --- Teardown code (runs after each test) ---
     command = "lsof -t /dev/vfio/* | xargs kill"
     results = subprocess.run(command,
@@ -39,7 +47,9 @@ def run_after_each_test():
                              shell=True,
                              capture_output=True,
                              text=True)
-    print(f"Killing TPU resources: {results.stdout}, {results.stderr}")
+    print(
+        f"Tear down: Killing TPU resources: {results.stdout}, {results.stderr}"
+    )

From c4bd0a31331a22d8e7eab17cbf16bd47bc9f226b Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Sat, 25 Oct 2025 02:02:52 +0000
Subject: [PATCH 3/4] Try: delete the instance and sleep.

Signed-off-by: Xiongfei Wei
---
 tests/lora/test_lora.py | 35 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py
index fba079b58..2f3632a0d 100644
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -1,6 +1,6 @@
 # https://github.com/vllm-project/vllm/blob/ed10f3cea199a7a1f3532fbe367f5c5479a6cae9/tests/tpu/lora/test_lora.py
 import os
-import subprocess
+import time

 import pytest
 import vllm
@@ -28,30 +28,6 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
                     max_lora_rank=8)


-@pytest.fixture(autouse=True)
-def run_after_each_test():
-    # --- Setup code (runs before each test) ---
-    # print("\nSetting up...")
-    command = "lsof -t /dev/vfio/* | xargs kill"
-    results = subprocess.run(command,
-                             shell=True,
-                             capture_output=True,
-                             text=True)
-    print(
-        f"Setting up: Killing TPU resources: {results.stdout}, {results.stderr}"
-    )
-    yield  # This is where the test runs
-    # --- Teardown code (runs after each test) ---
-    command = "lsof -t /dev/vfio/* | xargs kill"
-    results = subprocess.run(command,
-                             shell=True,
-                             capture_output=True,
-                             text=True)
-    print(
-        f"Tear down: Killing TPU resources: {results.stdout}, {results.stderr}"
-    )
-
-
 # For multi-chip test, we only use TP=2 because the base model Qwen/Qwen2.5-3B-Instruct has 2 kv heads and the current attention kernel requires it to be divisible by tp_size.
 TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]
@@ -81,6 +57,9 @@ def test_single_lora(tp):
     assert answer.isdigit()
     assert int(answer) == 2

+    del llm
+    time.sleep(10)
+

 @pytest.mark.parametrize("tp", TP)
 def test_lora_hotswapping(tp):
@@ -113,6 +92,9 @@ def test_lora_hotswapping(tp):
         assert answer.isdigit()
         assert int(answer) == i + 1, f"Expected {i + 1}, got {answer}"

+    del llm
+    time.sleep(10)
+

 @pytest.mark.parametrize("tp", TP)
 def test_multi_lora(tp):
@@ -146,3 +128,6 @@ def test_multi_lora(tp):
         assert int(
             output.strip()
             [0]) == i + 1, f"Expected {i + 1}, got {int(output.strip()[0])}"
+
+    del llm
+    time.sleep(10)

From 39b178678ea1dd2ff90785e0160596a682ec5a98 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Mon, 27 Oct 2025 16:46:04 +0000
Subject: [PATCH 4/4] Revert the change in Dockerfile.

Signed-off-by: Xiongfei Wei
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index a46f075c9..f82997de6 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -10,7 +10,7 @@ RUN pip uninstall -y torch torch_xla torchvision

 # Install some basic utilities
 RUN apt-get update && apt-get install -y \
-    git lsof \
+    git \
     libopenblas-base libopenmpi-dev libomp-dev

 # Build vLLM
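
Note: after PATCH 3/4, every test ends with the same manual cleanup, del llm followed by time.sleep(10), to let the TPU runtime release /dev/vfio/* before the next test starts. Below is a minimal sketch of how that repeated cleanup could be centralized in a pytest fixture. This is an illustration under assumptions, not part of the series: the fixture name llm_factory is hypothetical, and the vllm.LLM kwargs that the diff context elides are reconstructed from vLLM's standard LoRA options and may differ from the actual file.

import gc
import time

import pytest
import vllm


def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
    # Same shape as the helper in tests/lora/test_lora.py; kwargs hidden
    # by the diff hunks are assumptions.
    return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                    max_model_len=256,
                    tensor_parallel_size=tp,
                    enable_lora=True,
                    max_loras=num_loras,
                    max_lora_rank=8)


@pytest.fixture
def llm_factory():
    llms = []

    def _create(num_loras: int, tp: int = 1) -> vllm.LLM:
        llm = setup_vllm(num_loras, tp)
        llms.append(llm)
        return llm

    yield _create
    # Teardown: drop the references and give the runtime a grace period to
    # release /dev/vfio/* before the next test initializes the chips.
    llms.clear()
    gc.collect()
    time.sleep(10)

A test would then take llm_factory as an argument and call llm = llm_factory(num_loras=1, tp=tp); the 10-second grace period (and, if it ever returns, the lsof-based kill from PATCH 1/4 and 2/4) then lives in one place instead of at the end of every test.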