
Commit f17eca7

vanbasten23 authored and sierraisland committed
Reduce the host overhead for LoRA (#930)
Signed-off-by: Xiongfei Wei <[email protected]>
1 parent b5080b4 commit f17eca7

File tree

tpu_inference/layers/vllm/sharding.py
tpu_inference/runner/lora_utils.py
tpu_inference/worker/tpu_worker_jax.py

3 files changed: +18 -6 lines changed


tpu_inference/layers/vllm/sharding.py

Lines changed: 13 additions & 3 deletions
@@ -24,11 +24,9 @@ def shard_model_to_tpu(model: torch.nn.Module,
                        mesh: Mesh) -> dict[str, torchax.torch.Tensor]:
     """
     Shard the model weights and move them to TPU.
-
     At the same time, also turn the weight tensors into torchax tensors so that
     jax code can interop with it and the overall program can be traced and
     compiled in XLA.
-
     Args:
         model: A PyTorch model whose weights are on CPU main memory.
         mesh: JAX mesh object for sharding.
@@ -51,6 +49,18 @@ def shard_model_to_tpu(model: torch.nn.Module,
     return {**params, **buffers}


+def update_lora(model: torch.nn.Module,
+                initial_params_buffers) -> dict[str, torchax.torch.Tensor]:
+    params, buffers = _extract_all_params_buffers(model)
+    params_buffers = {**params, **buffers}
+    for k, v in params_buffers.items():
+        if 'lora_a_stacked' in k or 'lora_b_stacked' in k:
+            assert k in initial_params_buffers, f"{k} not in initial_params_buffers"
+            initial_params_buffers[k] = v
+
+    return initial_params_buffers
+
+
 def _extract_all_params_buffers(model: torch.nn.Module):
     return dict(model.named_parameters()), dict(model.named_buffers())

@@ -116,11 +126,11 @@ def _shard_base_linear_lora_replicated(layer: BaseLinearLayerWithLoRA,
 # TODO: Add custom sharding logic for following lora layers
 def _shard_merged_column_parallel_linear_lora(
         layer: MergedColumnParallelLinearWithLoRA, mesh: Mesh) -> None:
+    assert layer.n_slices > 0, "layer.n_slices should be greater than 0"
     # lora_a_stacked[i] has shape [max_loras, 1, max_lora_rank, in_features]
     sharded_lora_a_tpu = torch.nn.ParameterList()
     sharded_lora_b_tpu = torch.nn.ParameterList()

-    assert layer.n_slices > 0, "layer.n_slices should be greater than 0"
     # lora_b_stacked[i] has shape [max_loras, 1, out_features, max_lora_rank]
     lora_b_partition_spec = P(None, None, 'model', None)
     lora_b_sharding = NamedSharding(mesh, lora_b_partition_spec)
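
The new update_lora helper only overwrites the stacked LoRA entries of an already-sharded state dict; the base weights are never re-extracted or re-transferred. Below is a hedged toy illustration of that key-matching loop (the key names and placeholder values are made up for illustration; only the filtering logic mirrors the diff):

# Toy illustration (not part of the commit): only entries whose names contain
# 'lora_a_stacked' or 'lora_b_stacked' are overwritten; base weights stay untouched.
initial_params_buffers = {
    "layers.0.qkv_proj.weight": "<sharded base weight>",        # hypothetical key
    "layers.0.qkv_proj.lora_a_stacked.0": "<old LoRA A slot>",  # hypothetical key
    "layers.0.qkv_proj.lora_b_stacked.0": "<old LoRA B slot>",  # hypothetical key
}
fresh_params_buffers = {
    "layers.0.qkv_proj.weight": "<base weight, ignored>",
    "layers.0.qkv_proj.lora_a_stacked.0": "<new LoRA A slot>",
    "layers.0.qkv_proj.lora_b_stacked.0": "<new LoRA B slot>",
}
for k, v in fresh_params_buffers.items():
    if 'lora_a_stacked' in k or 'lora_b_stacked' in k:
        initial_params_buffers[k] = v  # only the LoRA slots change
# initial_params_buffers now carries the new LoRA tensors next to the untouched base weight.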

tpu_inference/runner/lora_utils.py

Lines changed: 3 additions & 3 deletions
@@ -7,7 +7,7 @@
 from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
 from vllm.lora.request import LoRARequest

-from tpu_inference.layers.vllm.sharding import shard_model_to_tpu
+from tpu_inference.layers.vllm.sharding import update_lora

 if TYPE_CHECKING:
     from tpu_inference.runner.tpu_jax_runner import TPUModelRunner
@@ -41,8 +41,8 @@ def set_active_loras(self, num_scheduled_tokens_per_req,
         self.runner._set_active_loras(prompt_lora_mapping, token_lora_mapping,
                                       lora_requests)

-        params_and_buffers = shard_model_to_tpu(self.runner.model.model,
-                                                self.runner.mesh)
+        params_and_buffers = update_lora(
+            self.runner.model.model, initial_params_buffers=self.runner.state)
         self.runner.state = jax_view(params_and_buffers)

     def extract_lora_metadata(self):
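
For context, the old path re-ran shard_model_to_tpu over every parameter and buffer on each LoRA activation, while the new path only patches the stacked LoRA tensors into the runner's existing sharded state. A minimal sketch of the two call shapes, assuming a runner object with model, mesh, and state attributes as in the diff, that jax_view comes from torchax.interop, and with activate_loras_before/after as hypothetical wrappers (not functions from the repo):

from torchax.interop import jax_view  # assumption: the jax_view helper used by lora_utils.py

from tpu_inference.layers.vllm.sharding import shard_model_to_tpu, update_lora

def activate_loras_before(runner):
    # Old behaviour: walk and re-shard *all* params/buffers on every activation.
    runner.state = jax_view(shard_model_to_tpu(runner.model.model, runner.mesh))

def activate_loras_after(runner):
    # New behaviour: overwrite only the lora_a_stacked / lora_b_stacked entries in place.
    runner.state = jax_view(update_lora(runner.model.model,
                                        initial_params_buffers=runner.state))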

tpu_inference/worker/tpu_worker_jax.py

Lines changed: 2 additions & 0 deletions
@@ -241,7 +241,9 @@ def add_lora(
     def profile(self, is_start: bool = True):
         if is_start:
             options = jax.profiler.ProfileOptions()
+            # default: https://docs.jax.dev/en/latest/profiling.html#general-options
             options.python_tracer_level = os.getenv("PYTHON_TRACER_LEVEL", 0)
+            options.host_tracer_level = os.getenv("HOST_TRACER_LEVEL", 1)
             jax.profiler.start_trace(self.profile_dir,
                                      profiler_options=options)
         else:
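
Both tracer levels now come from the environment, with the defaults matching the JAX profiling options linked in the added comment. A hedged usage sketch (the variable names are taken from the diff; the values shown are the same defaults the code falls back to):

# Example: set the tracer levels in the environment before the worker starts profiling.
# Level semantics are documented at
# https://docs.jax.dev/en/latest/profiling.html#general-options
import os

os.environ["PYTHON_TRACER_LEVEL"] = "0"  # 0 disables Python-level tracing
os.environ["HOST_TRACER_LEVEL"] = "1"    # 1 keeps host (CPU) tracing at its least verbose enabled level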
