import jax
import torch
import torchax

from tpu_inference.lora.torch_lora_ops import bgmv_torch


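# bgmv_torch computes one matrix-vector product per token: each token's
# input vector is multiplied by the LoRA matrix selected by that token's
# index, i.e. out[i] = loras[idxs[i]] @ inputs[i] (see _ref_bgmv_torch below).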
def test_bgmv_torch():
    num_tokens = 16
    hidden_size = 128
    max_loras = 9
    max_lora_rank = 8

    with torchax.default_env(), jax.default_device(jax.devices("tpu")[0]):
        # inputs: [num_tokens, hidden_size]
        inputs = torch.rand(num_tokens, hidden_size, device='jax')
        # loras: [max_loras, 1, max_lora_rank, hidden_size]; the unit dim is
        # squeezed away by the reference implementation.
        loras = torch.rand(max_loras, 1, max_lora_rank, hidden_size,
                           device='jax')
        # idxs: [num_tokens], the LoRA index each token uses.
        idxs = torch.randint(0, max_loras, (num_tokens,), device='jax')

        actual = bgmv_torch(inputs, loras, idxs)
        expected = _ref_bgmv_torch(inputs, loras, idxs)
        # Loose atol since TPU matmuls may not be bit-exact with the
        # reference loop below.
        torch.testing.assert_close(actual, expected, atol=3e-2, rtol=1e-3)


def _ref_bgmv_torch(inputs, loras, idxs):
    # Drop the unit dim: [max_loras, 1, rank, hidden] -> [max_loras, rank, hidden].
    if loras.ndim == 4:
        loras = loras.squeeze(dim=1)

    # An equivalent vectorized reference implementation:
    #   selected_loras = loras[idxs]
    #   return torch.einsum('td,tld->tl', inputs, selected_loras)
    num_tokens, _ = inputs.shape
    outputs = []
    for i in range(num_tokens):
        x = inputs[i]  # [hidden_size]
        lora = loras[idxs[i]]  # [max_lora_rank, hidden_size]
        outputs.append(torch.matmul(lora, x))  # [max_lora_rank]

    return torch.stack(outputs, dim=0)
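

# To run this test on a TPU host (assuming torchax and tpu_inference are
# installed), point pytest at this file, e.g.:
#   pytest -k test_bgmv_torch <path-to-this-file>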