Implemented the weight sync for torch tensors.

datenglin · copybara-github · commit b194b32ddd92 · 2026-06-04T17:27:22.000-07:00
PiperOrigin-RevId: 926213635
diff --git a/api/torch/weight_synchronizer.py b/api/torch/weight_synchronizer.py
@@ -52,6 +52,68 @@ def pull_weights(self, source: str) -> None:
     """Inference server pulling current weights from the source peer coordinate E2E."""
     self._impl.PullWeights(source)
 
+  def d2h(self) -> None:
+    """Copies current weights Device-to-Host (D2H) into the Host buffer; the native binding awaits completion."""
+    self._impl.D2h()
+
+  def pull_weights_chunk(
+      self,
+      source: str,
+      src_shard_idx: int,
+      src_offset_bytes: int,
+      dst_shard_idx: int,
+      dst_offset_bytes: int,
+      size_bytes: int,
+  ) -> None:
+    """Pulls one byte range directly from a source worker peer (inference-server side).
+
+    Args:
+      source: "host:port" coordinate of the source peer.
+      src_shard_idx: Index of the source device shard to read from.
+      src_offset_bytes: Byte offset inside the source shard's staging buffer.
+      dst_shard_idx: Index of the local destination device shard to write to.
+      dst_offset_bytes: Byte offset inside the local destination staging buffer.
+      size_bytes: Number of bytes to transfer.
+    """
+    self._impl.PullWeightsChunk(
+        source,
+        src_shard_idx,
+        src_offset_bytes,
+        dst_shard_idx,
+        dst_offset_bytes,
+        size_bytes,
+    )
+
+  def h2d_chunk(
+      self,
+      shard_idx: int,
+      host_offset_bytes: int,
+      device_offset_bytes: int,
+      size_bytes: int,
+  ) -> None:
+    """Copies one chunk Host-to-Device (H2D) into Device HBM; the native binding awaits completion.
+
+    Args:
+      shard_idx: Target shard index.
+      host_offset_bytes: Source byte offset in the Host staging buffer.
+      device_offset_bytes: Destination byte offset in Device memory.
+      size_bytes: Number of bytes to copy.
+    """
+    self._impl.H2dChunk(
+        shard_idx, host_offset_bytes, device_offset_bytes, size_bytes
+    )
+
+  def get_host_buffer(
+      self, layer_idx: int = 0, shard_idx: int = 0
+  ) -> torch.Tensor:
+    """Returns a zero-copy CPU tensor view (flat bytes in the native binding) of the C++ staging buffer.
+
+    Args:
+      layer_idx: Layer index of the staging buffer to view.
+      shard_idx: Shard index of the staging buffer to view.
+    """
+    return self._impl.get_host_buffer(layer_idx, shard_idx)
+
   @property
   def local_port(self) -> Optional[int]:
     """Returns assigned ephemeral listener port coordinates."""
diff --git a/frameworks/torch/weight_synchronizer_module.cc b/frameworks/torch/weight_synchronizer_module.cc
@@ -60,6 +60,74 @@ NB_MODULE(_weight_synchronizer, m) {
             }
           },
           nb::arg("source"), nb::call_guard<nb::gil_scoped_release>())
+      .def(
+          "D2h",
+          // Run D2h, await the result, and throw on any non-OK status.
+          [](WeightSynchronizer& self) {
+            auto copy_or = self.D2h();
+            if (!copy_or.ok()) {
+              throw std::runtime_error(
+                  "WeightSynchronizer D2H failed: " +
+                  std::string(copy_or.status().message()));
+            }
+            const absl::Status done = copy_or.value().Await().status();
+            if (done.ok()) return;
+            throw std::runtime_error("WeightSynchronizer D2H copy failed: " +
+                                     std::string(done.message()));
+          },
+          nb::call_guard<nb::gil_scoped_release>())
+      .def(
+          "H2dChunk",
+          // Run H2dChunk, await the result, and throw on any non-OK status.
+          [](WeightSynchronizer& self, size_t shard_idx,
+             size_t host_offset_bytes, size_t device_offset_bytes,
+             size_t size_bytes) {
+            auto copy_or = self.H2dChunk(
+                shard_idx, host_offset_bytes, device_offset_bytes, size_bytes);
+            if (!copy_or.ok()) {
+              throw std::runtime_error(
+                  "WeightSynchronizer H2dChunk failed: " +
+                  std::string(copy_or.status().message()));
+            }
+            const absl::Status done = copy_or.value().Await().status();
+            if (done.ok()) return;
+            throw std::runtime_error(
+                "WeightSynchronizer H2dChunk copy failed: " +
+                std::string(done.message()));
+          },
+          nb::arg("shard_idx"), nb::arg("host_offset_bytes"),
+          nb::arg("device_offset_bytes"), nb::arg("size_bytes"),
+          nb::call_guard<nb::gil_scoped_release>())
+      .def(
+          "PullWeightsChunk",
+          // Forward the byte-range pull and throw on any non-OK status.
+          [](WeightSynchronizer& self, const std::string& source,
+             size_t src_shard_idx, size_t src_offset_bytes,
+             size_t dst_shard_idx, size_t dst_offset_bytes, size_t size_bytes) {
+            const absl::Status pull_status = self.PullWeightsChunk(
+                source, src_shard_idx, src_offset_bytes, dst_shard_idx,
+                dst_offset_bytes, size_bytes);
+            if (pull_status.ok()) return;
+            throw std::runtime_error(
+                "WeightSynchronizer PullWeightsChunk failed: " +
+                std::string(pull_status.message()));
+          },
+          nb::arg("source"), nb::arg("src_shard_idx"),
+          nb::arg("src_offset_bytes"), nb::arg("dst_shard_idx"),
+          nb::arg("dst_offset_bytes"), nb::arg("size_bytes"),
+          nb::call_guard<nb::gil_scoped_release>())
+      .def(
+          "get_host_buffer",
+          // Result only aliases self's buffer, so keep self alive with it.
+          [](WeightSynchronizer& self, size_t layer_idx, size_t shard_idx) {
+            const uint8_t* ptr = self.GetHostBufferPtr(layer_idx, shard_idx);
+            if (!ptr) throw std::runtime_error("Invalid layer or shard index");
+            size_t size = self.slice_byte_size() + 256 * 1024;
+            return at::from_blob(const_cast<uint8_t*>(ptr),
+                                 {static_cast<int64_t>(size)}, at::kByte);
+          },
+          nb::arg("layer_idx") = 0, nb::arg("shard_idx") = 0,
+          nb::keep_alive<0, 1>())
       .def_prop_ro("local_port", &WeightSynchronizer::local_port)
       .def_prop_ro("num_layers", &WeightSynchronizer::num_layers)
       .def_prop_ro("num_shards", &WeightSynchronizer::num_shards)
diff --git a/weight_sync/BUILD b/weight_sync/BUILD
@@ -39,6 +39,8 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_macros",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/types:span",
+        "@xla//xla/pjrt:pjrt_client",
         "@xla//xla/pjrt/c:pjrt_c_api_hdrs",
         "@xla//xla/pjrt/c:pjrt_c_api_raw_buffer_extension_hdrs",
         "@xla//xla/tsl/platform:errors",
diff --git a/weight_sync/weight_synchronizer_base.cc b/weight_sync/weight_synchronizer_base.cc