From ca1b731170612a95cd36c205417b1fdee198d16f Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 23 Oct 2025 16:21:35 +0000
Subject: [PATCH 1/4] Try to manually release the TPU resources.

Signed-off-by: Xiongfei Wei
---
 .buildkite/pipeline_jax.yml |  2 +-
 docker/Dockerfile           |  2 +-
 tests/lora/test_lora.py     | 26 +++++++++++++++-----------
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/.buildkite/pipeline_jax.yml b/.buildkite/pipeline_jax.yml
index dc18454d7..30a474dcd 100644
--- a/.buildkite/pipeline_jax.yml
+++ b/.buildkite/pipeline_jax.yml
@@ -156,7 +156,7 @@ steps:
     commands:
     - |
       .buildkite/scripts/run_in_docker.sh \
-        bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py -k multi_lora'
+        bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'

 # -----------------------------------------------------------------

diff --git a/docker/Dockerfile b/docker/Dockerfile
index f82997de6..a46f075c9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -10,7 +10,7 @@ RUN pip uninstall -y torch torch_xla torchvision

 # Install some basic utilities
 RUN apt-get update && apt-get install -y \
-    git \
+    git lsof \
     libopenblas-base libopenmpi-dev libomp-dev

 # Build vLLM

diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py
index 20df2e68a..82fe59166 100644
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -1,5 +1,6 @@
 # https://github.com/vllm-project/vllm/blob/ed10f3cea199a7a1f3532fbe367f5c5479a6cae9/tests/tpu/lora/test_lora.py
 import os
+import subprocess

 import pytest
 import vllm
@@ -16,17 +17,6 @@
 # 100 training iterations with a training batch size of 100.


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
-    for all tests in this file
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        yield
-
-
 def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
     return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                     max_model_len=256,
@@ -38,6 +28,20 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
                     max_lora_rank=8)


+@pytest.fixture(autouse=True)
+def run_after_each_test():
+    # --- Setup code (runs before each test) ---
+    # print("\nSetting up...")
+    yield  # This is where the test runs
+    # --- Teardown code (runs after each test) ---
+    command = "lsof -t /dev/vfio/* | xargs kill"
+    results = subprocess.run(command,
+                             shell=True,
+                             capture_output=True,
+                             text=True)
+    print(f"Killing TPU resources: {results.stdout}, {results.stderr}")
+
+
 # For multi-chip test, we only use TP=2 because the base model Qwen/Qwen2.5-3B-Instruct has 2 kv heads and the current attention kernel requires it to be divisible by tp_size.
 TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]

From 5db832205fc446d4ceb3371ada641acecff30861 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 23 Oct 2025 17:50:32 +0000
Subject: [PATCH 2/4] Also release the TPU at setup.

Signed-off-by: Xiongfei Wei
---
 tests/lora/test_lora.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py
index 82fe59166..fba079b58 100644
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -32,6 +32,14 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
 def run_after_each_test():
     # --- Setup code (runs before each test) ---
     # print("\nSetting up...")
+    command = "lsof -t /dev/vfio/* | xargs kill"
+    results = subprocess.run(command,
+                             shell=True,
+                             capture_output=True,
+                             text=True)
+    print(
+        f"Setting up: Killing TPU resources: {results.stdout}, {results.stderr}"
+    )
     yield  # This is where the test runs
     # --- Teardown code (runs after each test) ---
     command = "lsof -t /dev/vfio/* | xargs kill"
     results = subprocess.run(command,
@@ -39,7 +47,9 @@ def run_after_each_test():
                              shell=True,
                              capture_output=True,
                              text=True)
-    print(f"Killing TPU resources: {results.stdout}, {results.stderr}")
+    print(
+        f"Tear down: Killing TPU resources: {results.stdout}, {results.stderr}"
+    )

From c4bd0a31331a22d8e7eab17cbf16bd47bc9f226b Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Sat, 25 Oct 2025 02:02:52 +0000
Subject: [PATCH 3/4] Try: delete the instance and sleep.

Signed-off-by: Xiongfei Wei
---
 tests/lora/test_lora.py | 35 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py
index fba079b58..2f3632a0d 100644
--- a/tests/lora/test_lora.py
+++ b/tests/lora/test_lora.py
@@ -1,6 +1,6 @@
 # https://github.com/vllm-project/vllm/blob/ed10f3cea199a7a1f3532fbe367f5c5479a6cae9/tests/tpu/lora/test_lora.py
 import os
-import subprocess
+import time

 import pytest
 import vllm
@@ -28,30 +28,6 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
                     max_lora_rank=8)


-@pytest.fixture(autouse=True)
-def run_after_each_test():
-    # --- Setup code (runs before each test) ---
-    # print("\nSetting up...")
-    command = "lsof -t /dev/vfio/* | xargs kill"
-    results = subprocess.run(command,
-                             shell=True,
-                             capture_output=True,
-                             text=True)
-    print(
-        f"Setting up: Killing TPU resources: {results.stdout}, {results.stderr}"
-    )
-    yield  # This is where the test runs
-    # --- Teardown code (runs after each test) ---
-    command = "lsof -t /dev/vfio/* | xargs kill"
-    results = subprocess.run(command,
-                             shell=True,
-                             capture_output=True,
-                             text=True)
-    print(
-        f"Tear down: Killing TPU resources: {results.stdout}, {results.stderr}"
-    )
-
-
 # For multi-chip test, we only use TP=2 because the base model Qwen/Qwen2.5-3B-Instruct has 2 kv heads and the current attention kernel requires it to be divisible by tp_size.
 TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]
@@ -81,6 +57,9 @@ def test_single_lora(tp):
     assert answer.isdigit()
     assert int(answer) == 2

+    del llm
+    time.sleep(10)
+

 @pytest.mark.parametrize("tp", TP)
 def test_lora_hotswapping(tp):
@@ -113,6 +92,9 @@ def test_lora_hotswapping(tp):
         assert answer.isdigit()
         assert int(answer) == i + 1, f"Expected {i + 1}, got {answer}"

+    del llm
+    time.sleep(10)
+

 @pytest.mark.parametrize("tp", TP)
 def test_multi_lora(tp):
@@ -146,3 +128,6 @@ def test_multi_lora(tp):
         assert int(
             output.strip()
             [0]) == i + 1, f"Expected {i + 1}, got {int(output.strip()[0])}"
+
+    del llm
+    time.sleep(10)

From 39b178678ea1dd2ff90785e0160596a682ec5a98 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Mon, 27 Oct 2025 16:46:04 +0000
Subject: [PATCH 4/4] Revert the change in Dockerfile.

Signed-off-by: Xiongfei Wei
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index a46f075c9..f82997de6 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -10,7 +10,7 @@ RUN pip uninstall -y torch torch_xla torchvision

 # Install some basic utilities
 RUN apt-get update && apt-get install -y \
-    git lsof \
+    git \
     libopenblas-base libopenmpi-dev libomp-dev

 # Build vLLM
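
Note: after PATCH 3/4, every test ends with the same manual cleanup, del llm followed by time.sleep(10), to let the TPU runtime release /dev/vfio/* before the next test starts. Below is a minimal sketch of how that repeated cleanup could be centralized in a pytest fixture. This is an illustration under assumptions, not part of the series: the fixture name llm_factory is hypothetical, and the vllm.LLM kwargs that the diff context elides are reconstructed from vLLM's standard LoRA options and may differ from the actual file.

import gc
import time

import pytest
import vllm


def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
    # Same shape as the helper in tests/lora/test_lora.py; kwargs hidden
    # by the diff hunks are assumptions.
    return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                    max_model_len=256,
                    tensor_parallel_size=tp,
                    enable_lora=True,
                    max_loras=num_loras,
                    max_lora_rank=8)


@pytest.fixture
def llm_factory():
    llms = []

    def _create(num_loras: int, tp: int = 1) -> vllm.LLM:
        llm = setup_vllm(num_loras, tp)
        llms.append(llm)
        return llm

    yield _create
    # Teardown: drop the references and give the runtime a grace period to
    # release /dev/vfio/* before the next test initializes the chips.
    llms.clear()
    gc.collect()
    time.sleep(10)

A test would then take llm_factory as an argument and call llm = llm_factory(num_loras=1, tp=tp); the 10-second grace period (and, if it ever returns, the lsof-based kill from PATCH 1/4 and 2/4) then lives in one place instead of at the end of every test.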