From 2c04df94cfd34b37b9ce2c27f937afdba8e79ecc Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 24 Oct 2024 02:08:07 +0300
Subject: [PATCH 01/55] working simple benchmark

---
 csrc/multidevice/communicator.cpp      |   2 +
 tests/cpp/test_multidevice_overlap.cpp | 103 +++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 8197ea224f4..ae6fc1fd9b4 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -196,6 +196,8 @@ Communicator::Communicator(
     return;
   }
 
+  cudaSetDevice(local_rank_);
+
 #ifdef NVFUSER_DISTRIBUTED
   c10d::TCPStoreOptions store_opts;
   {
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 39cab67cd13..5def14c8045 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -15,6 +15,7 @@
 #include <ir/utils.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
+#include <cuda_runtime.h>
 
 namespace nvfuser {
 
@@ -40,6 +41,108 @@ void synchronizeStreams(const std::vector<c10::cuda::CUDAStream>& streams) {
 
 } // namespace
 
+using OverlapBenchmarkParams = std::tuple<
+    CommunicatorBackend,
+    /*S=*/int64_t,
+    /*M=*/int64_t,
+    /*K=*/int64_t,
+    /*N=*/int64_t,
+    /*number_of_streams=*/int64_t>;
+
+class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
+ protected:
+  static std::unordered_map<std::string, float> times;
+
+  static void TearDownTestSuite() {
+    auto rank = Communicator::getInstance().deviceId();
+    for (auto it: times) {
+      std::cout << "rank " << rank << ": " << it.first << ": " << it.second << std::endl;
+    }
+  }
+};
+
+std::unordered_map<std::string, float> OverlapBenchmark::times = {};
+
+TEST_P(OverlapBenchmark, DummyBenchmark) {
+  constexpr int64_t number_of_warmups = 120;
+  constexpr int64_t number_of_iterations = 500;
+  const int64_t D = communicator_->size();
+  auto [backend,
+        S,
+        M,
+        K,
+        N,
+        number_of_streams] = GetParam();
+
+  GTEST_ASSERT_EQ(M % S, 0);
+
+  auto world = communicator_->getWorld(backend);
+
+  std::vector<c10::cuda::CUDAStream> streams =
+      createStreams(number_of_streams, communicator_->deviceId());
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+  auto ta = at::randn({S, M/S,K}, options);
+  auto ta_unsharded = at::empty({S, D, M/S,K}, options);
+  auto tb = at::randn({K,N}, options);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  for (const auto& iteration :
+       c10::irange(number_of_warmups + number_of_iterations)) {
+    if (iteration == number_of_warmups) {
+      cudaEventRecord(start);
+    }
+    for (auto j : c10::irange(S)) {
+      int64_t stream_index = j % streams.size();
+      setCurrentCUDAStream(streams.at(stream_index));
+
+      auto ta_j = ta.select(0, j);
+      auto ta_unsharded_j = ta_unsharded.select(0, j);
+
+      // communication
+      world->_allgather_base(ta_unsharded_j, ta_j)->wait();
+      // compute
+      auto tc_j = torch::matmul(ta_unsharded_j,tb);
+    }
+    setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId()));
+    synchronizeStreams(streams);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  milliseconds /= number_of_iterations;
+
+  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  times.insert({test_name, milliseconds});
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ,
+    OverlapBenchmark,
+    testing::Combine(
+    testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
+    /*S=*/testing::Values(1,2,4,8),
+    /*M=*/testing::Values(pow(2,10), pow(2,15)),
+    /*K=*/testing::Values(pow(2,10), pow(2,15)),
+    /*N=*/testing::Values(pow(2,10)),
+    /*number_of_streams=*/testing::Values(3, 8)),
+    [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
+        -> std::string {
+      std::ostringstream os;
+      os << /*backend*/std::get<0>(info.param) << "_"
+         << "S" << std::get<1>(info.param) << "_"
+         << "M" << std::get<2>(info.param) << "_"
+         << "K" << std::get<3>(info.param) << "_"
+         << "N" << std::get<4>(info.param) << "_"
+         << "Streams" << std::get<5>(info.param);
+      return os.str();
+    });
+
+
 struct OverlapTestParams {
   // Tensors sizes
   int64_t M = std::pow(2, 6);

From af36cf14ac7622945ba6ec6ff8ce68434cc94230 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 25 Oct 2024 03:54:42 +0300
Subject: [PATCH 02/55] benchmark: use std::map for times, reduce iteration counts, print per-test time

---
 tests/cpp/test_multidevice_overlap.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 5def14c8045..76e8192d1fd 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -51,7 +51,7 @@ using OverlapBenchmarkParams = std::tuple<
 
 class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
-  static std::unordered_map<std::string, float> times;
+  static std::map<std::string, float> times;
 
   static void TearDownTestSuite() {
     auto rank = Communicator::getInstance().deviceId();
@@ -61,11 +61,13 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf
   }
 };
 
-std::unordered_map<std::string, float> OverlapBenchmark::times = {};
+std::map<std::string, float> OverlapBenchmark::times = {};
 
 TEST_P(OverlapBenchmark, DummyBenchmark) {
-  constexpr int64_t number_of_warmups = 120;
-  constexpr int64_t number_of_iterations = 500;
+  int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_iterations = 100;
+
+
   const int64_t D = communicator_->size();
   auto [backend,
         S,
@@ -118,6 +120,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
 
   std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
   times.insert({test_name, milliseconds});
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
 }
 
 INSTANTIATE_TEST_SUITE_P(

From 68b858a7fcd16e0f79fd62cafe6401496c924c60 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 25 Oct 2024 07:22:06 -0700
Subject: [PATCH 03/55] test script

---
 bench/process_outputs                  |  2 ++
 bench/test                             | 35 ++++++++++++++++++++++++++
 tests/cpp/test_multidevice_overlap.cpp |  4 +--
 3 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 bench/process_outputs
 create mode 100755 bench/test

diff --git a/bench/process_outputs b/bench/process_outputs
new file mode 100644
index 00000000000..139597f9cb0
--- /dev/null
+++ b/bench/process_outputs
@@ -0,0 +1,2 @@
+
+
diff --git a/bench/test b/bench/test
new file mode 100755
index 00000000000..f0d5728fb4b
--- /dev/null
+++ b/bench/test
@@ -0,0 +1,35 @@
+#!/bin/bash
+EXPERIMENT=tl_nccl
+DATE=$(date +%Y%m%d-%H%M)
+LOG_BASE="/opt/pytorch/Fuser/bench/logs"
+
+export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}"
+
+mkdir -p $LOGS
+LOG_FILE_INFO="${LOGS}/info"
+echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
+
+NP=8
+BACKEND=UCC
+S=*
+M=*
+K=*
+N=*
+Streams=*
+export GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}"
+echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
+
+MPIFLAGS=" -np $NP"
+MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
+# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
+echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
+
+TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}"
+echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
+
+CMD="mpirun $MPIFLAGS $TEST_CMD"
+echo $CMD | tee -a $LOG_FILE_INFO
+$CMD | tee -a $LOG_FILE_INFO
+
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 76e8192d1fd..b8c998618b4 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -128,11 +128,11 @@ INSTANTIATE_TEST_SUITE_P(
     OverlapBenchmark,
     testing::Combine(
     testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
-    /*S=*/testing::Values(1,2,4,8),
+    /*S=*/testing::Values(1,2,4,8, 16, 32),
     /*M=*/testing::Values(pow(2,10), pow(2,15)),
     /*K=*/testing::Values(pow(2,10), pow(2,15)),
     /*N=*/testing::Values(pow(2,10)),
-    /*number_of_streams=*/testing::Values(3, 8)),
+    /*number_of_streams=*/testing::Values(3, 8, 32)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;

From 0c3493b6c1782d27b5f417d2237751a0b37bf8df Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 28 Oct 2024 13:13:09 +0200
Subject: [PATCH 04/55] bench: add process_outputs script, print summary on rank 0 only, switch to NCCL

---
 bench/process_outputs                  | 5 +++++
 bench/test                             | 2 +-
 tests/cpp/test_multidevice_overlap.cpp | 5 ++++-
 3 files changed, 10 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 bench/process_outputs

diff --git a/bench/process_outputs b/bench/process_outputs
old mode 100644
new mode 100755
index 139597f9cb0..c1781394dbc
--- a/bench/process_outputs
+++ b/bench/process_outputs
@@ -1,2 +1,7 @@
+#!/bin/bash
 
+FILE="/opt/pytorch/Fuser/bench/logs/${1}/info"
 
+cat $FILE | grep "time 0: "  #| awk '{print $4}'
+
+# | grep -E 'Streams32\b'
\ No newline at end of file
diff --git a/bench/test b/bench/test
index f0d5728fb4b..b6375719387 100755
--- a/bench/test
+++ b/bench/test
@@ -10,7 +10,7 @@ LOG_FILE_INFO="${LOGS}/info"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=UCC
+BACKEND=NCCL
 S=*
 M=*
 K=*
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index b8c998618b4..2febd097b62 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -55,8 +55,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf
 
   static void TearDownTestSuite() {
     auto rank = Communicator::getInstance().deviceId();
+    if (rank != 0) {
+      return;
+    }
     for (auto it: times) {
-      std::cout << "rank " << rank << ": " << it.first << ": " << it.second << std::endl;
+      std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl;
     }
   }
 };

From b30b44bb897c0ec290f37f0e0e02d82ceea3421f Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Tue, 29 Oct 2024 09:46:09 -0700
Subject: [PATCH 05/55] add nsight profiling

---
 bench/test                             | 36 ++++++++++++++++----------
 tests/cpp/test_multidevice_overlap.cpp |  7 +++++
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/bench/test b/bench/test
index b6375719387..8ce85c8ff0f 100755
--- a/bench/test
+++ b/bench/test
@@ -1,35 +1,45 @@
 #!/bin/bash
-EXPERIMENT=tl_nccl
+EXPERIMENT=profile
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
 export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}"
 
 mkdir -p $LOGS
-LOG_FILE_INFO="${LOGS}/info"
+LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
 BACKEND=NCCL
-S=*
-M=*
-K=*
-N=*
-Streams=*
-export GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}"
+S=4
+M=32768
+K=32768
+N=1024
+Streams=8
+GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}"
+export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
-
+``
 MPIFLAGS=" -np $NP"
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
+MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
+# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
+MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
 TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}"
 echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
 
-CMD="mpirun $MPIFLAGS $TEST_CMD"
-echo $CMD | tee -a $LOG_FILE_INFO
-$CMD | tee -a $LOG_FILE_INFO
+MPICMD="mpirun $MPIFLAGS $TEST_CMD"
+echo $MPICMD | tee -a $LOG_FILE_INFO
+
+NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
+
+CMD="${NSYSCMD} ${MPICMD}"
+sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
+echo $CMD | tee -a ${LOG_FILE_INFO}
+$CMD | tee -a ${LOG_FILE_INFO}
 
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 2febd097b62..189a0da732c 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -15,6 +15,7 @@
 #include <ir/utils.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
+#include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 
 namespace nvfuser {
@@ -97,6 +98,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
 
   for (const auto& iteration :
        c10::irange(number_of_warmups + number_of_iterations)) {
+    if (iteration == 10) {
+      cudaProfilerStart();;
+    }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
     }
@@ -114,6 +118,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
     }
     setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId()));
     synchronizeStreams(streams);
+    if (iteration == 15) {
+      cudaProfilerStop();;
+    }
   }
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);

From 0592a139918072d66c13790741310ecc195abe45 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 31 Oct 2024 06:39:08 -0700
Subject: [PATCH 06/55] nsight and tl/nccl/ sync mode

---
 bench/test | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bench/test b/bench/test
index 8ce85c8ff0f..b51daa63ebd 100755
--- a/bench/test
+++ b/bench/test
@@ -1,16 +1,16 @@
 #!/bin/bash
-EXPERIMENT=profile
+EXPERIMENT=profile_driver
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
 export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}"
 
 mkdir -p $LOGS
-LOG_FILE_INFO="${LOGS}/info.txt"
+export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=NCCL
+BACKEND=UCC
 S=4
 M=32768
 K=32768
@@ -28,6 +28,7 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
 MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
+MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
 TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}"

From 0037b1e9b1398b9518a80011b1601f7e4f6cda5a Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 4 Nov 2024 05:12:10 -0800
Subject: [PATCH 07/55] add cuStreamWriteValue but linkage error

---
 bench/test                             |  7 ++++---
 tests/cpp/test_multidevice_overlap.cpp | 29 ++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/bench/test b/bench/test
index b51daa63ebd..2856cff9074 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_driver
+EXPERIMENT=profile_ncc_max_connection2
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=UCC
+BACKEND=NCCL
 S=4
 M=32768
 K=32768
@@ -28,7 +28,8 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
 MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
-MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver"
+MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
+MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
 TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 189a0da732c..8fdaf8afdd9 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -15,6 +15,7 @@
 #include <ir/utils.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
+#include <cuda.h>
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 
@@ -48,7 +49,8 @@ using OverlapBenchmarkParams = std::tuple<
     /*M=*/int64_t,
     /*K=*/int64_t,
     /*N=*/int64_t,
-    /*number_of_streams=*/int64_t>;
+    /*number_of_streams=*/int64_t,
+    /*add_cuStreamWriteValue32=*/bool>;
 
 class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
@@ -78,7 +80,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
         M,
         K,
         N,
-        number_of_streams] = GetParam();
+        number_of_streams,
+        add_cuStreamWriteValue32] = GetParam();
 
   GTEST_ASSERT_EQ(M % S, 0);
 
@@ -96,6 +99,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+  // CUdeviceptr pDevice;
+  // void* ptr;
+  // if (add_cuStreamWriteValue32) {
+  //   cudaMallocHost(&ptr, 32);
+  //   cudaHostGetDevicePointer((void**)&pDevice, ptr, 0);
+  // }
+
   for (const auto& iteration :
        c10::irange(number_of_warmups + number_of_iterations)) {
     if (iteration == 10) {
@@ -113,6 +123,11 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
 
       // communication
       world->_allgather_base(ta_unsharded_j, ta_j)->wait();
+
+      // if (add_cuStreamWriteValue32) {
+      //   cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
+      // }
+
       // compute
       auto tc_j = torch::matmul(ta_unsharded_j,tb);
     }
@@ -131,6 +146,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
   std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
   times.insert({test_name, milliseconds});
   std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+
+  // if (add_cuStreamWriteValue32) {
+  //   cudaFree(ptr);
+  // }
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -142,7 +161,8 @@ INSTANTIATE_TEST_SUITE_P(
     /*M=*/testing::Values(pow(2,10), pow(2,15)),
     /*K=*/testing::Values(pow(2,10), pow(2,15)),
     /*N=*/testing::Values(pow(2,10)),
-    /*number_of_streams=*/testing::Values(3, 8, 32)),
+    /*number_of_streams=*/testing::Values(3, 8, 32),
+    /*add_cuStreamWriteValue32*/testing::Values(false)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -151,7 +171,8 @@ INSTANTIATE_TEST_SUITE_P(
          << "M" << std::get<2>(info.param) << "_"
          << "K" << std::get<3>(info.param) << "_"
          << "N" << std::get<4>(info.param) << "_"
-         << "Streams" << std::get<5>(info.param);
+         << "Streams" << std::get<5>(info.param) << "_"
+         << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32";
       return os.str();
     });
 

From ec71e233de02deefb609bd81d2bd7dd6b3f2451f Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 4 Nov 2024 06:27:56 -0800
Subject: [PATCH 08/55] multiple pgs

---
 bench/test                             | 13 +++++++------
 tests/cpp/test_multidevice_overlap.cpp | 17 ++++++++++++-----
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/bench/test b/bench/test
index 2856cff9074..5433bbee9ce 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_ncc_max_connection2
+EXPERIMENT=profile_baseline_NCCL
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -11,25 +11,26 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
 BACKEND=NCCL
-S=4
+S=1
 M=32768
 K=32768
 N=1024
 Streams=8
+Pgs=1
 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
-GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 ``
 MPIFLAGS=" -np $NP"
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-MPIFLAGS+=" -x UCC_COLL_TRACE=info"
+# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
 MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
-MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
-MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
+# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
+# MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
 TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 8fdaf8afdd9..ff79bb45609 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -50,7 +50,8 @@ using OverlapBenchmarkParams = std::tuple<
     /*K=*/int64_t,
     /*N=*/int64_t,
     /*number_of_streams=*/int64_t,
-    /*add_cuStreamWriteValue32=*/bool>;
+    /*add_cuStreamWriteValue32=*/bool,
+    /*number_of_pgs=*/int64_t>;
 
 class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
@@ -81,11 +82,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
         K,
         N,
         number_of_streams,
-        add_cuStreamWriteValue32] = GetParam();
+        add_cuStreamWriteValue32,
+        number_of_pgs] = GetParam();
 
   GTEST_ASSERT_EQ(M % S, 0);
 
-  auto world = communicator_->getWorld(backend);
+  std::vector<RankType> all_ranks(communicator_->size());
+  std::iota(all_ranks.begin(), all_ranks.end(), 0);
 
   std::vector<c10::cuda::CUDAStream> streams =
       createStreams(number_of_streams, communicator_->deviceId());
@@ -118,6 +121,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
       int64_t stream_index = j % streams.size();
       setCurrentCUDAStream(streams.at(stream_index));
 
+      auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs));
+
       auto ta_j = ta.select(0, j);
       auto ta_unsharded_j = ta_unsharded.select(0, j);
 
@@ -162,7 +167,8 @@ INSTANTIATE_TEST_SUITE_P(
     /*K=*/testing::Values(pow(2,10), pow(2,15)),
     /*N=*/testing::Values(pow(2,10)),
     /*number_of_streams=*/testing::Values(3, 8, 32),
-    /*add_cuStreamWriteValue32*/testing::Values(false)),
+    /*add_cuStreamWriteValue32*/testing::Values(false),
+    /*number_of_pgs=*/testing::Values(1, 2, 4, 8)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -172,7 +178,8 @@ INSTANTIATE_TEST_SUITE_P(
          << "K" << std::get<3>(info.param) << "_"
          << "N" << std::get<4>(info.param) << "_"
          << "Streams" << std::get<5>(info.param) << "_"
-         << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32";
+         << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
+         << "Pgs" << std::get<7>(info.param);
       return os.str();
     });
 

From a15fdfc9d84258d38442a78110d57be1a121598c Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 4 Nov 2024 06:39:54 -0800
Subject: [PATCH 09/55] re-enable cuStreamWriteValue32

---
 bench/test                             |  9 +++++----
 csrc/driver_api.h                      |  1 +
 tests/cpp/test_multidevice_overlap.cpp | 26 +++++++++++++-------------
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/bench/test b/bench/test
index 5433bbee9ce..4f3559e283a 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_baseline_NCCL
+EXPERIMENT=profile_cuStreamWrite_NCCL
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -11,14 +11,15 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
 BACKEND=NCCL
-S=1
+S=8
 M=32768
 K=32768
 N=1024
 Streams=8
 Pgs=1
+cuStreamWrite=WithcuStreamWriteValue32_
 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
-GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 ``
@@ -39,7 +40,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
 MPICMD="mpirun $MPIFLAGS $TEST_CMD"
 echo $MPICMD | tee -a $LOG_FILE_INFO
 
-NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
+# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
 
 CMD="${NSYSCMD} ${MPICMD}"
 sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
diff --git a/csrc/driver_api.h b/csrc/driver_api.h
index b8c413a4054..8105cf855c2 100644
--- a/csrc/driver_api.h
+++ b/csrc/driver_api.h
@@ -32,6 +32,7 @@ namespace nvfuser {
   fn(cuModuleGetFunction);                \
   fn(cuModuleLoadDataEx);                 \
   fn(cuModuleUnload);                     \
+  fn(cuStreamWriteValue32);               \
   fn(cuOccupancyMaxActiveBlocksPerMultiprocessor)
 
 #if (CUDA_VERSION >= 12000)
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index ff79bb45609..fef6e9bf468 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -102,12 +102,12 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  // CUdeviceptr pDevice;
-  // void* ptr;
-  // if (add_cuStreamWriteValue32) {
-  //   cudaMallocHost(&ptr, 32);
-  //   cudaHostGetDevicePointer((void**)&pDevice, ptr, 0);
-  // }
+  CUdeviceptr pDevice;
+  void* ptr;
+  if (add_cuStreamWriteValue32) {
+    cudaMallocHost(&ptr, 32);
+    cudaHostGetDevicePointer((void**)&pDevice, ptr, 0);
+  }
 
   for (const auto& iteration :
        c10::irange(number_of_warmups + number_of_iterations)) {
@@ -129,9 +129,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
       // communication
       world->_allgather_base(ta_unsharded_j, ta_j)->wait();
 
-      // if (add_cuStreamWriteValue32) {
-      //   cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
-      // }
+      if (add_cuStreamWriteValue32) {
+        cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
+      }
 
       // compute
       auto tc_j = torch::matmul(ta_unsharded_j,tb);
@@ -152,9 +152,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
   times.insert({test_name, milliseconds});
   std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
 
-  // if (add_cuStreamWriteValue32) {
-  //   cudaFree(ptr);
-  // }
+  if (add_cuStreamWriteValue32) {
+    cudaFreeHost(ptr); // allocated with cudaMallocHost above, so cudaFree would fail
+  }
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -167,7 +167,7 @@ INSTANTIATE_TEST_SUITE_P(
     /*K=*/testing::Values(pow(2,10), pow(2,15)),
     /*N=*/testing::Values(pow(2,10)),
     /*number_of_streams=*/testing::Values(3, 8, 32),
-    /*add_cuStreamWriteValue32*/testing::Values(false),
+    /*add_cuStreamWriteValue32*/testing::Values(false, true),
     /*number_of_pgs=*/testing::Values(1, 2, 4, 8)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {

From 6682a33b366b3f21a1ced568106e8a3b475c8567 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 4 Nov 2024 07:57:44 -0800
Subject: [PATCH 10/55] add tl/cuda and ec/cuda flags in bash test script

---
 bench/test | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/bench/test b/bench/test
index 4f3559e283a..5ad427b4876 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_cuStreamWrite_NCCL
+EXPERIMENT=profile_UCC_TL_CUDA
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,14 +10,14 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=NCCL
-S=8
+BACKEND=UCC
+S=4
 M=32768
 K=32768
 N=1024
 Streams=8
 Pgs=1
-cuStreamWrite=WithcuStreamWriteValue32_
+# cuStreamWrite=WithcuStreamWriteValue32_
 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
 GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
@@ -25,11 +25,21 @@ echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 ``
 MPIFLAGS=" -np $NP"
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
+# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8"
+# MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0"
+# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel"
+# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M"
+# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512"
+# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb"
+# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32"
+# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
+
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
 # MPIFLAGS+=" -x UCC_COLL_TRACE=info"
-MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
+# MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
-MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO

From b01f1f4fe236be4144182cac5cbdcef15c559337 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 4 Nov 2024 08:40:14 -0800
Subject: [PATCH 11/55] add option to unfuse loops

---
 bench/test                             |  5 +++--
 tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/bench/test b/bench/test
index 5ad427b4876..2102c1eb743 100755
--- a/bench/test
+++ b/bench/test
@@ -11,15 +11,16 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
 BACKEND=UCC
-S=4
+S=8
 M=32768
 K=32768
 N=1024
 Streams=8
 Pgs=1
+UNFUSE="_unfused"
 # cuStreamWrite=WithcuStreamWriteValue32_
 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
-GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 ``
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index fef6e9bf468..d4b9c757f7a 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -51,7 +51,8 @@ using OverlapBenchmarkParams = std::tuple<
     /*N=*/int64_t,
     /*number_of_streams=*/int64_t,
     /*add_cuStreamWriteValue32=*/bool,
-    /*number_of_pgs=*/int64_t>;
+    /*number_of_pgs=*/int64_t,
+    /*unfuse_loops=*/bool>;
 
 class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
@@ -83,7 +84,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
         N,
         number_of_streams,
         add_cuStreamWriteValue32,
-        number_of_pgs] = GetParam();
+        number_of_pgs,
+        unfuse_loops] = GetParam();
 
   GTEST_ASSERT_EQ(M % S, 0);
 
@@ -132,9 +134,20 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
       if (add_cuStreamWriteValue32) {
         cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
       }
+      if (unfuse_loops == false) {
+        // compute
+        auto tc_j = torch::matmul(ta_unsharded_j,tb);
+      }
+    }
+    if (unfuse_loops) {
+      for (auto j : c10::irange(S)) {
+        int64_t stream_index = j % streams.size();
+        setCurrentCUDAStream(streams.at(stream_index));
+        auto ta_unsharded_j = ta_unsharded.select(0, j);
 
-      // compute
-      auto tc_j = torch::matmul(ta_unsharded_j,tb);
+        // compute
+        auto tc_j = torch::matmul(ta_unsharded_j,tb);
+      }
     }
     setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId()));
     synchronizeStreams(streams);
@@ -168,7 +181,8 @@ INSTANTIATE_TEST_SUITE_P(
     /*N=*/testing::Values(pow(2,10)),
     /*number_of_streams=*/testing::Values(3, 8, 32),
     /*add_cuStreamWriteValue32*/testing::Values(false, true),
-    /*number_of_pgs=*/testing::Values(1, 2, 4, 8)),
+    /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
+    /*unfuse_loops=*/testing::Values(false, true)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -179,7 +193,8 @@ INSTANTIATE_TEST_SUITE_P(
          << "N" << std::get<4>(info.param) << "_"
          << "Streams" << std::get<5>(info.param) << "_"
          << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
-         << "Pgs" << std::get<7>(info.param);
+         << "Pgs" << std::get<7>(info.param)
+         << ((std::get<8>(info.param))? "_unfused" : "");
       return os.str();
     });
 

From ea7fd37d61ad310c5dcb2d8ca599d8212003ff44 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Tue, 5 Nov 2024 02:53:36 -0800
Subject: [PATCH 12/55] add cuda graphs. Only working for NCCL and S=1 because
 there is a syncStream in nccl

---
 bench/test                             | 13 ++--
 tests/cpp/test_multidevice_overlap.cpp | 84 ++++++++++++++++----------
 2 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/bench/test b/bench/test
index 2102c1eb743..8a64225d9e9 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_UCC_TL_CUDA
+EXPERIMENT=profile_cudaGraph_NCCL_S1
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,17 +10,18 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=UCC
-S=8
+BACKEND=NCCL
+S=1
 M=32768
 K=32768
 N=1024
 Streams=8
 Pgs=1
-UNFUSE="_unfused"
+# UNFUSE="_unfused"
+GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
-GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 ``
@@ -51,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
 MPICMD="mpirun $MPIFLAGS $TEST_CMD"
 echo $MPICMD | tee -a $LOG_FILE_INFO
 
-# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
+NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
 
 CMD="${NSYSCMD} ${MPICMD}"
 sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index d4b9c757f7a..c93987890b4 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -6,6 +6,7 @@
  */
 // clang-format on
 #include <ATen/Functions.h>
+#include <ATen/cuda/CUDAGraph.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/ArrayRef.h>
 #include <fusion.h>
@@ -52,7 +53,8 @@ using OverlapBenchmarkParams = std::tuple<
     /*number_of_streams=*/int64_t,
     /*add_cuStreamWriteValue32=*/bool,
     /*number_of_pgs=*/int64_t,
-    /*unfuse_loops=*/bool>;
+    /*unfuse_loops=*/bool,
+    /*use_cuda_graph=*/bool>;
 
 class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
@@ -72,8 +74,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf
 std::map<std::string, float> OverlapBenchmark::times = {};
 
 TEST_P(OverlapBenchmark, DummyBenchmark) {
-  int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_warmups = 50;
   constexpr int64_t number_of_iterations = 100;
+  constexpr int64_t iteration_profiler_start = 10;
+  constexpr int64_t iteration_profiler_end = 15;
+  constexpr int64_t iteration_cuda_graph_capture = 5;
 
 
   const int64_t D = communicator_->size();
@@ -85,7 +90,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
         number_of_streams,
         add_cuStreamWriteValue32,
         number_of_pgs,
-        unfuse_loops] = GetParam();
+        unfuse_loops,
+        use_cuda_graph] = GetParam();
 
   GTEST_ASSERT_EQ(M % S, 0);
 
@@ -94,6 +100,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
 
   std::vector<c10::cuda::CUDAStream> streams =
       createStreams(number_of_streams, communicator_->deviceId());
+  setCurrentCUDAStream(streams.at(0));
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
   auto ta = at::randn({S, M/S,K}, options);
@@ -104,6 +111,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+  at::cuda::CUDAGraph cuda_graph;
+
   CUdeviceptr pDevice;
   void* ptr;
   if (add_cuStreamWriteValue32) {
@@ -113,45 +122,56 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
 
   for (const auto& iteration :
        c10::irange(number_of_warmups + number_of_iterations)) {
-    if (iteration == 10) {
+    if (iteration == iteration_profiler_start) {
       cudaProfilerStart();;
     }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
     }
-    for (auto j : c10::irange(S)) {
-      int64_t stream_index = j % streams.size();
-      setCurrentCUDAStream(streams.at(stream_index));
-
-      auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs));
-
-      auto ta_j = ta.select(0, j);
-      auto ta_unsharded_j = ta_unsharded.select(0, j);
-
-      // communication
-      world->_allgather_base(ta_unsharded_j, ta_j)->wait();
-
-      if (add_cuStreamWriteValue32) {
-        cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
+    if (iteration <= iteration_cuda_graph_capture) {
+      if (iteration == iteration_cuda_graph_capture) {
+        cuda_graph.capture_begin();
       }
-      if (unfuse_loops == false) {
-        // compute
-        auto tc_j = torch::matmul(ta_unsharded_j,tb);
-      }
-    }
-    if (unfuse_loops) {
       for (auto j : c10::irange(S)) {
         int64_t stream_index = j % streams.size();
         setCurrentCUDAStream(streams.at(stream_index));
+
+        auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs));
+
+        auto ta_j = ta.select(0, j);
         auto ta_unsharded_j = ta_unsharded.select(0, j);
 
-        // compute
-        auto tc_j = torch::matmul(ta_unsharded_j,tb);
+        // communication
+        world->_allgather_base(ta_unsharded_j, ta_j)->wait();
+
+        if (add_cuStreamWriteValue32) {
+          cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
+        }
+        if (unfuse_loops == false) {
+          // compute
+          auto tc_j = torch::matmul(ta_unsharded_j,tb);
+        }
+      }
+      if (unfuse_loops) {
+        for (auto j : c10::irange(S)) {
+          int64_t stream_index = j % streams.size();
+          setCurrentCUDAStream(streams.at(stream_index));
+          auto ta_unsharded_j = ta_unsharded.select(0, j);
+
+          // compute
+          auto tc_j = torch::matmul(ta_unsharded_j,tb);
+        }
       }
+      if (iteration == iteration_cuda_graph_capture) {
+        cuda_graph.capture_end();
+      } else {
+        setCurrentCUDAStream(streams.at(0));
+        synchronizeStreams(streams);
+      }
+    } else {
+      cuda_graph.replay();
     }
-    setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId()));
-    synchronizeStreams(streams);
-    if (iteration == 15) {
+    if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
   }
@@ -182,7 +202,8 @@ INSTANTIATE_TEST_SUITE_P(
     /*number_of_streams=*/testing::Values(3, 8, 32),
     /*add_cuStreamWriteValue32*/testing::Values(false, true),
     /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
-    /*unfuse_loops=*/testing::Values(false, true)),
+    /*unfuse_loops=*/testing::Values(false, true),
+    /*use_cuda_graph=*/testing::Values(false)), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -194,7 +215,8 @@ INSTANTIATE_TEST_SUITE_P(
          << "Streams" << std::get<5>(info.param) << "_"
          << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
          << "Pgs" << std::get<7>(info.param)
-         << ((std::get<8>(info.param))? "_unfused" : "");
+         << ((std::get<8>(info.param))? "_unfused" : "")
+         << ((std::get<9>(info.param))? "_WithCudaGraph" : "");
       return os.str();
     });
 

From 9dddac2a6320e315f1300febc624a03e084aa54f Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 25 Nov 2024 16:51:59 -0800
Subject: [PATCH 13/55] write matmul to sliced output

---
 tests/cpp/test_multidevice_overlap.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index c93987890b4..5600041dc7d 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -73,7 +73,7 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf
 
 std::map<std::string, float> OverlapBenchmark::times = {};
 
-TEST_P(OverlapBenchmark, DummyBenchmark) {
+TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   constexpr int64_t number_of_warmups = 50;
   constexpr int64_t number_of_iterations = 100;
   constexpr int64_t iteration_profiler_start = 10;
@@ -106,6 +106,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
   auto ta = at::randn({S, M/S,K}, options);
   auto ta_unsharded = at::empty({S, D, M/S,K}, options);
   auto tb = at::randn({K,N}, options);
+  auto tc = at::empty({S, D, M/S, N}, options);
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
@@ -140,6 +141,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
 
         auto ta_j = ta.select(0, j);
         auto ta_unsharded_j = ta_unsharded.select(0, j);
+        auto tc_j = ta_unsharded.select(0, j);
 
         // communication
         world->_allgather_base(ta_unsharded_j, ta_j)->wait();
@@ -149,7 +151,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
         }
         if (unfuse_loops == false) {
           // compute
-          auto tc_j = torch::matmul(ta_unsharded_j,tb);
+          torch::matmul_out(tc_j, ta_unsharded_j,tb);
         }
       }
       if (unfuse_loops) {
@@ -157,9 +159,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) {
           int64_t stream_index = j % streams.size();
           setCurrentCUDAStream(streams.at(stream_index));
           auto ta_unsharded_j = ta_unsharded.select(0, j);
+          auto tc_j = ta_unsharded.select(0, j);
 
           // compute
-          auto tc_j = torch::matmul(ta_unsharded_j,tb);
+          torch::matmul_out(tc_j, ta_unsharded_j,tb);
         }
       }
       if (iteration == iteration_cuda_graph_capture) {

From faf8bbe6b9c1ddf19b31069d66387b060481e9bf Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 28 Nov 2024 08:12:18 -0800
Subject: [PATCH 14/55] wip cuStreamWriteValue not working

---
 bench/test                             |  14 +-
 tests/cpp/test_multidevice_overlap.cpp | 219 +++++++++++++++++++++++--
 2 files changed, 216 insertions(+), 17 deletions(-)

diff --git a/bench/test b/bench/test
index 8a64225d9e9..c27cb9ce74b 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_cudaGraph_NCCL_S1
+EXPERIMENT=profile_NCCL_with_cuStreamValue
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,17 +10,17 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=NCCL
-S=1
+BACKEND=UCC
+S=8
 M=32768
 K=32768
 N=1024
 Streams=8
 Pgs=1
 # UNFUSE="_unfused"
-GRAPH="_WithCudaGraph"
+# GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
-GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/"
+GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
 GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
@@ -42,7 +42,7 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
-# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
+MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
@@ -52,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
 MPICMD="mpirun $MPIFLAGS $TEST_CMD"
 echo $MPICMD | tee -a $LOG_FILE_INFO
 
-NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
+# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
 
 CMD="${NSYSCMD} ${MPICMD}"
 sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 5600041dc7d..0d55580a11a 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -20,6 +20,8 @@
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 
+#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1
+
 namespace nvfuser {
 
 namespace {
@@ -44,6 +46,190 @@ void synchronizeStreams(const std::vector<c10::cuda::CUDAStream>& streams) {
 
 } // namespace
 
+TEST_F(NVFuserTest, cuStreamWriteValue32) {
+  constexpr cuuint32_t value = 3;
+  cudaError_t error;
+  CUdeviceptr pDevice;
+  volatile cuuint32_t* ptr;
+  error = cudaSetDevice(0);
+  ASSERT_EQ(error, 0);
+  error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t));
+  ASSERT_EQ(error, 0);
+  error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0);
+  ASSERT_EQ(error, 0);
+
+  at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool(
+              /*isHighPriority=*/true, /*device_index*/0);
+  CUstream stream = c10_stream.stream();
+  CUresult st;
+  st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0);
+  ASSERT_EQ(st, 0);
+
+  torch::cuda::synchronize();
+  cuuint32_t ptr2;
+  error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost);
+  ASSERT_EQ(error, 0);
+  ASSERT_EQ(ptr2, value);
+
+
+  int i = 0;
+  while (i < 10000000) {
+    if (*ptr == value) {
+      std::cout << " BREAK " << *ptr <<std::endl;
+      break;
+    }
+    if (i % 1000000 == 0) {
+      std::cout << "waiting, read value = " << *ptr <<std::endl;
+    }
+    i++;
+  }
+}
+
+using DummyOverlapBenchmarkParams = std::tuple<
+    CommunicatorBackend,
+    /*M=*/int64_t,
+    /*K=*/int64_t,
+    /*N=*/int64_t,
+    /*L(communication msgsize)=*/int64_t,
+    /*number_of_streams=*/int64_t,
+    /*add_cuStreamWriteValue32=*/bool,
+    /*number_of_pgs=*/int64_t>;
+
+class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
+ protected:
+  static std::map<std::string, float> times;
+
+  static void TearDownTestSuite() {
+    auto rank = Communicator::getInstance().deviceId();
+    if (rank != 0) {
+      return;
+    }
+    for (auto it: times) {
+      std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl;
+    }
+  }
+};
+
+std::map<std::string, float> DummyOverlapBenchmark::times = {};
+
+TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
+  constexpr int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_iterations = 100;
+  constexpr int64_t iteration_profiler_start = 10;
+  constexpr int64_t iteration_profiler_end = 15;
+
+
+  auto [backend,
+        M,
+        K,
+        N,
+        L,
+        number_of_streams,
+        add_cuStreamWriteValue32,
+        number_of_pgs] = GetParam();
+
+  std::vector<RankType> all_ranks(communicator_->size());
+  std::iota(all_ranks.begin(), all_ranks.end(), 0);
+
+  std::vector<c10::cuda::CUDAStream> streams =
+      createStreams(number_of_streams, communicator_->deviceId());
+  setCurrentCUDAStream(streams.at(0));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+  auto ta = at::randn({M, K}, options);
+  auto tb = at::randn({K, N}, options);
+  auto tc = at::empty({M, N}, options);
+  auto src = at::randn({L}, options);
+  auto dst = at::empty({L * communicator_->size()}, options);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  CUdeviceptr pDevice;
+  void* ptr;
+  if (add_cuStreamWriteValue32) {
+    cudaMallocHost(&ptr, 32);
+    cudaHostGetDevicePointer((void**)&pDevice, ptr, 0);
+  }
+
+  for (const auto& iteration :
+       c10::irange(number_of_warmups + number_of_iterations)) {
+    if (iteration == iteration_profiler_start) {
+      cudaProfilerStart();;
+    }
+    if (iteration == number_of_warmups) {
+      cudaEventRecord(start);
+    }
+    int64_t stream_index = iteration % streams.size();
+    setCurrentCUDAStream(streams.at(stream_index));
+
+    auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(iteration % number_of_pgs));
+
+    // communication
+    world->_allgather_base(dst, src)->wait();
+
+    // compute
+    torch::matmul_out(tc, ta, tb);
+
+    if (add_cuStreamWriteValue32) {
+
+      cuStreamWriteValue32(
+#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS
+        (CUstream)world->getCudaStream(communicator_->device()).stream(),
+#else
+        (CUstream)streams.at(stream_index).stream(), 
+#endif
+        (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
+    }
+
+    setCurrentCUDAStream(streams.at(0));
+    synchronizeStreams(streams);
+    if (iteration == iteration_profiler_end) {
+      cudaProfilerStop();;
+    }
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  milliseconds /= number_of_iterations;
+
+  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  times.insert({test_name, milliseconds});
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+
+  if (add_cuStreamWriteValue32) {
+    cudaFree(ptr);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ,
+    DummyOverlapBenchmark,
+    testing::Combine(
+    testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
+    /*M=*/testing::Values(pow(2,10), pow(2,15)),
+    /*K=*/testing::Values(pow(2,10), pow(2,15)),
+    /*N=*/testing::Values(pow(2,10)),
+    /*L=*/testing::Values(pow(2,15)),
+    /*number_of_streams=*/testing::Values(1, 8),
+    /*add_cuStreamWriteValue32*/testing::Values(false, true),
+    /*number_of_pgs=*/testing::Values(1, 2, 4, 8)),
+    [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
+        -> std::string {
+      std::ostringstream os;
+      os << /*backend*/std::get<0>(info.param) << "_"
+         << "M" << std::get<1>(info.param) << "_"
+         << "K" << std::get<2>(info.param) << "_"
+         << "N" << std::get<3>(info.param) << "_"
+         << "L" << std::get<4>(info.param) << "_"
+         << "Streams" << std::get<5>(info.param) << "_"
+         << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
+         << "Pgs" << std::get<7>(info.param);
+      return os.str();
+    });
+
 using OverlapBenchmarkParams = std::tuple<
     CommunicatorBackend,
     /*S=*/int64_t,
@@ -115,10 +301,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   at::cuda::CUDAGraph cuda_graph;
 
   CUdeviceptr pDevice;
-  void* ptr;
+  cuuint32_t* ptr;
   if (add_cuStreamWriteValue32) {
-    cudaMallocHost(&ptr, 32);
-    cudaHostGetDevicePointer((void**)&pDevice, ptr, 0);
+    cudaMallocHost((void**)&ptr, sizeof(cuuint32_t));
+    cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0);
   }
 
   for (const auto& iteration :
@@ -129,8 +315,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
     }
-    if (iteration <= iteration_cuda_graph_capture) {
-      if (iteration == iteration_cuda_graph_capture) {
+    if (!use_cuda_graph || (iteration <= iteration_cuda_graph_capture)) {
+      if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) {
         cuda_graph.capture_begin();
       }
       for (auto j : c10::irange(S)) {
@@ -141,13 +327,22 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
 
         auto ta_j = ta.select(0, j);
         auto ta_unsharded_j = ta_unsharded.select(0, j);
-        auto tc_j = ta_unsharded.select(0, j);
+        auto tc_j = tc.select(0, j);
 
         // communication
         world->_allgather_base(ta_unsharded_j, ta_j)->wait();
 
         if (add_cuStreamWriteValue32) {
-          cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
+          if (!communicator_->deviceId()){
+            std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl;
+          }
+          cuStreamWriteValue32(
+#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS
+            (CUstream)world->getCudaStream(communicator_->device()).stream(),
+#else
+            // (CUstream)streams.at(stream_index).stream(),
+#endif
+            (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0);
         }
         if (unfuse_loops == false) {
           // compute
@@ -159,13 +354,13 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
           int64_t stream_index = j % streams.size();
           setCurrentCUDAStream(streams.at(stream_index));
           auto ta_unsharded_j = ta_unsharded.select(0, j);
-          auto tc_j = ta_unsharded.select(0, j);
+          auto tc_j = tc.select(0, j);
 
           // compute
           torch::matmul_out(tc_j, ta_unsharded_j,tb);
         }
       }
-      if (iteration == iteration_cuda_graph_capture) {
+      if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) {
         cuda_graph.capture_end();
       } else {
         setCurrentCUDAStream(streams.at(0));
@@ -189,7 +384,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
 
   if (add_cuStreamWriteValue32) {
-    cudaFree(ptr);
+    std::cout << "RANK " << communicator_->device() << " entering while loop. Max index=" << (number_of_warmups + number_of_iterations)*S + S << std::endl;
+    while (*ptr < (cuuint32_t)(number_of_warmups + number_of_iterations)*S + S - 1) {
+      std::cout << "RANK " << communicator_->device() << " waiting at index=" << *ptr << std::endl;
+    }
+    cudaFree((void*)ptr);
   }
 }
 

From a6b5fd75896d26a15fc0e2b6a8a66e9e81e60016 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 2 Dec 2024 05:45:31 -0800
Subject: [PATCH 15/55] dummy benchmark

---
 bench/test                             |  30 ++++---
 tests/cpp/test_multidevice_overlap.cpp | 110 ++++---------------------
 2 files changed, 35 insertions(+), 105 deletions(-)

diff --git a/bench/test b/bench/test
index c27cb9ce74b..cff8d8b34bb 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=profile_NCCL_with_cuStreamValue
+EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,27 +10,33 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=UCC
+BACKEND=NCCL
 S=8
-M=32768
+M=131072 #32768
 K=32768
-N=1024
+N=32768 #1024
+L=32768
 Streams=8
 Pgs=1
 # UNFUSE="_unfused"
 # GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
-GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
-GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
+# GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
+GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/"
+# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
+GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
-``
+
 MPIFLAGS=" -np $NP"
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
-MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
+# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8"
 # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0"
+# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver"
 # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512"
@@ -39,10 +45,12 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
 
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
+# MPIFLAGS+=" -x UCC_COLL_TRACE=debug"
+# MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
+MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
-MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
+# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
@@ -52,7 +60,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
 MPICMD="mpirun $MPIFLAGS $TEST_CMD"
 echo $MPICMD | tee -a $LOG_FILE_INFO
 
-# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
+NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
 
 CMD="${NSYSCMD} ${MPICMD}"
 sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 0d55580a11a..85059b89a31 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -20,7 +20,7 @@
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 
-#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1
+#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 0
 
 namespace nvfuser {
 
@@ -46,54 +46,12 @@ void synchronizeStreams(const std::vector<c10::cuda::CUDAStream>& streams) {
 
 } // namespace
 
-TEST_F(NVFuserTest, cuStreamWriteValue32) {
-  constexpr cuuint32_t value = 3;
-  cudaError_t error;
-  CUdeviceptr pDevice;
-  volatile cuuint32_t* ptr;
-  error = cudaSetDevice(0);
-  ASSERT_EQ(error, 0);
-  error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t));
-  ASSERT_EQ(error, 0);
-  error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0);
-  ASSERT_EQ(error, 0);
-
-  at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool(
-              /*isHighPriority=*/true, /*device_index*/0);
-  CUstream stream = c10_stream.stream();
-  CUresult st;
-  st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0);
-  ASSERT_EQ(st, 0);
-
-  torch::cuda::synchronize();
-  cuuint32_t ptr2;
-  error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost);
-  ASSERT_EQ(error, 0);
-  ASSERT_EQ(ptr2, value);
-
-
-  int i = 0;
-  while (i < 10000000) {
-    if (*ptr == value) {
-      std::cout << " BREAK " << *ptr <<std::endl;
-      break;
-    }
-    if (i % 1000000 == 0) {
-      std::cout << "waiting, read value = " << *ptr <<std::endl;
-    }
-    i++;
-  }
-}
-
 using DummyOverlapBenchmarkParams = std::tuple<
     CommunicatorBackend,
     /*M=*/int64_t,
     /*K=*/int64_t,
     /*N=*/int64_t,
-    /*L(communication msgsize)=*/int64_t,
-    /*number_of_streams=*/int64_t,
-    /*add_cuStreamWriteValue32=*/bool,
-    /*number_of_pgs=*/int64_t>;
+    /*L(communication msgsize)=*/int64_t>;
 
 class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
  protected:
@@ -123,17 +81,16 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
         M,
         K,
         N,
-        L,
-        number_of_streams,
-        add_cuStreamWriteValue32,
-        number_of_pgs] = GetParam();
+        L] = GetParam();
 
   std::vector<RankType> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
+  auto world = communicator_->getBackendForTeam(all_ranks, backend);
 
   std::vector<c10::cuda::CUDAStream> streams =
-      createStreams(number_of_streams, communicator_->deviceId());
-  setCurrentCUDAStream(streams.at(0));
+      createStreams(2, communicator_->deviceId());
+  auto& compute_stream = streams.at(0);
+  auto& communication_stream = streams.at(1);
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
   auto ta = at::randn({M, K}, options);
@@ -146,13 +103,6 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  CUdeviceptr pDevice;
-  void* ptr;
-  if (add_cuStreamWriteValue32) {
-    cudaMallocHost(&ptr, 32);
-    cudaHostGetDevicePointer((void**)&pDevice, ptr, 0);
-  }
-
   for (const auto& iteration :
        c10::irange(number_of_warmups + number_of_iterations)) {
     if (iteration == iteration_profiler_start) {
@@ -161,33 +111,18 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
     }
-    int64_t stream_index = iteration % streams.size();
-    setCurrentCUDAStream(streams.at(stream_index));
-
-    auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(iteration % number_of_pgs));
 
-    // communication
+    setCurrentCUDAStream(communication_stream);
     world->_allgather_base(dst, src)->wait();
 
     // compute
+    setCurrentCUDAStream(compute_stream);
     torch::matmul_out(tc, ta, tb);
 
-    if (add_cuStreamWriteValue32) {
-
-      cuStreamWriteValue32(
-#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS
-        (CUstream)world->getCudaStream(communicator_->device()).stream(),
-#else
-        (CUstream)streams.at(stream_index).stream(), 
-#endif
-        (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0);
-    }
-
-    setCurrentCUDAStream(streams.at(0));
-    synchronizeStreams(streams);
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
+    synchronizeStreams(streams);
   }
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);
@@ -198,10 +133,6 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
   times.insert({test_name, milliseconds});
   std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
-
-  if (add_cuStreamWriteValue32) {
-    cudaFree(ptr);
-  }
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -209,13 +140,10 @@ INSTANTIATE_TEST_SUITE_P(
     DummyOverlapBenchmark,
     testing::Combine(
     testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
-    /*M=*/testing::Values(pow(2,10), pow(2,15)),
-    /*K=*/testing::Values(pow(2,10), pow(2,15)),
-    /*N=*/testing::Values(pow(2,10)),
-    /*L=*/testing::Values(pow(2,15)),
-    /*number_of_streams=*/testing::Values(1, 8),
-    /*add_cuStreamWriteValue32*/testing::Values(false, true),
-    /*number_of_pgs=*/testing::Values(1, 2, 4, 8)),
+    /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -223,10 +151,7 @@ INSTANTIATE_TEST_SUITE_P(
          << "M" << std::get<1>(info.param) << "_"
          << "K" << std::get<2>(info.param) << "_"
          << "N" << std::get<3>(info.param) << "_"
-         << "L" << std::get<4>(info.param) << "_"
-         << "Streams" << std::get<5>(info.param) << "_"
-         << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
-         << "Pgs" << std::get<7>(info.param);
+         << "L" << std::get<4>(info.param);
       return os.str();
     });
 
@@ -333,14 +258,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
         world->_allgather_base(ta_unsharded_j, ta_j)->wait();
 
         if (add_cuStreamWriteValue32) {
-          if (!communicator_->deviceId()){
-            std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl;
-          }
           cuStreamWriteValue32(
 #if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS
             (CUstream)world->getCudaStream(communicator_->device()).stream(),
 #else
-            // (CUstream)streams.at(stream_index).stream(),
+            (CUstream)streams.at(stream_index).stream(),
 #endif
             (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0);
         }

From 8d927bf4d7537b2ae2450efd775c039c68ebffbe Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 2 Dec 2024 06:45:27 -0800
Subject: [PATCH 16/55] add pre post comms option

---
 bench/test                             | 22 ++++++++++++---------
 tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/bench/test b/bench/test
index cff8d8b34bb..28532970124 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE
+EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,21 +10,25 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=NCCL
-S=8
+BACKEND=UCC
 M=131072 #32768
 K=32768
 N=32768 #1024
-L=32768
+
+S=8
 Streams=8
 Pgs=1
+
+L=32768
+# PRE_COMM="_pre_comm"
+POST_COMM="_post_comm"
 # UNFUSE="_unfused"
 # GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
 # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
 GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/"
 # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
-GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}"
+GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 
@@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP"
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
-# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
-# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
+MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8"
 # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0"
 # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver"
@@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
 
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-# MPIFLAGS+=" -x UCC_COLL_TRACE=debug"
+# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
-MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
+# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
 # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 85059b89a31..9898df02ac8 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple<
     /*M=*/int64_t,
     /*K=*/int64_t,
     /*N=*/int64_t,
-    /*L(communication msgsize)=*/int64_t>;
+    /*L(communication msgsize)=*/int64_t,
+    /*pre_comm=*/bool,
+    /*post_comm=*/bool>;
 
 class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
  protected:
@@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
         M,
         K,
         N,
-        L] = GetParam();
+        L,
+        pre_comm,
+        post_comm] = GetParam();
 
   std::vector<RankType> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
@@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
       cudaEventRecord(start);
     }
 
-    setCurrentCUDAStream(communication_stream);
-    world->_allgather_base(dst, src)->wait();
+    if (pre_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
 
     // compute
     setCurrentCUDAStream(compute_stream);
     torch::matmul_out(tc, ta, tb);
 
+    if (post_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
+
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
@@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P(
     /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))),
+    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*pre-comm=*/testing::Bool(),
+    /*post-comm=*/testing::Bool()),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P(
          << "M" << std::get<1>(info.param) << "_"
          << "K" << std::get<2>(info.param) << "_"
          << "N" << std::get<3>(info.param) << "_"
-         << "L" << std::get<4>(info.param);
+         << "L" << std::get<4>(info.param)
+         << ((std::get<5>(info.param))? "_pre_comm" : "")
+         << ((std::get<6>(info.param))? "_post_comm" : "");
       return os.str();
     });
 

From d9c581c13a9742b3896baf1bd37bc8bcd0acb923 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 2 Dec 2024 06:45:27 -0800
Subject: [PATCH 17/55] add pre post comms option

---
 bench/test                             | 20 +++++++++++--------
 tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/bench/test b/bench/test
index cff8d8b34bb..72c22480714 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE
+EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -11,20 +11,24 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
 BACKEND=NCCL
-S=8
 M=131072 #32768
 K=32768
 N=32768 #1024
-L=32768
+
+S=8
 Streams=8
 Pgs=1
+
+L=32768
+# PRE_COMM="_pre_comm"
+POST_COMM="_post_comm"
 # UNFUSE="_unfused"
 # GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
 # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
 GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/"
 # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
-GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}"
+GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 
@@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP"
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
-# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
-# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
+MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8"
 # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0"
 # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver"
@@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
 
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-# MPIFLAGS+=" -x UCC_COLL_TRACE=debug"
+# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
-MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
+# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
 # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 85059b89a31..9898df02ac8 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple<
     /*M=*/int64_t,
     /*K=*/int64_t,
     /*N=*/int64_t,
-    /*L(communication msgsize)=*/int64_t>;
+    /*L(communication msgsize)=*/int64_t,
+    /*pre_comm=*/bool,
+    /*post_comm=*/bool>;
 
 class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
  protected:
@@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
         M,
         K,
         N,
-        L] = GetParam();
+        L,
+        pre_comm,
+        post_comm] = GetParam();
 
   std::vector<RankType> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
@@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
       cudaEventRecord(start);
     }
 
-    setCurrentCUDAStream(communication_stream);
-    world->_allgather_base(dst, src)->wait();
+    if (pre_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
 
     // compute
     setCurrentCUDAStream(compute_stream);
     torch::matmul_out(tc, ta, tb);
 
+    if (post_comm) {
+      setCurrentCUDAStream(communication_stream);
+      world->_allgather_base(dst, src)->wait();
+    }
+
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
@@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P(
     /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))),
+    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*pre-comm=*/testing::Bool(),
+    /*post-comm=*/testing::Bool()),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P(
          << "M" << std::get<1>(info.param) << "_"
          << "K" << std::get<2>(info.param) << "_"
          << "N" << std::get<3>(info.param) << "_"
-         << "L" << std::get<4>(info.param);
+         << "L" << std::get<4>(info.param)
+         << ((std::get<5>(info.param))? "_pre_comm" : "")
+         << ((std::get<6>(info.param))? "_post_comm" : "");
       return os.str();
     });
 

From bfc7fa6ac9e81d6b4a2552733cc9a76dc1c66635 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 6 Dec 2024 17:26:44 +0200
Subject: [PATCH 18/55] cleanup test script

---
 bench/test | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/bench/test b/bench/test
index 72c22480714..969b8da00e2 100755
--- a/bench/test
+++ b/bench/test
@@ -1,5 +1,5 @@
 #!/bin/bash
-EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE
+EXPERIMENT=Dummy_profile_UCC_TL_CUDA
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
@@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt"
 echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
 
 NP=8
-BACKEND=NCCL
+BACKEND=UCC
 M=131072 #32768
 K=32768
 N=32768 #1024
@@ -20,8 +20,8 @@ Streams=8
 Pgs=1
 
 L=32768
-# PRE_COMM="_pre_comm"
-POST_COMM="_post_comm"
+PRE_COMM="_pre_comm"
+# POST_COMM="_post_comm"
 # UNFUSE="_unfused"
 # GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
@@ -33,28 +33,37 @@ export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 
 MPIFLAGS=" -np $NP"
-MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
+
+# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
+# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
+# MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1"
+
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
-# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
-MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
-MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
+# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
+
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
+# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb"
+# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32"
+# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
+
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8"
 # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0"
 # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver"
 # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512"
-# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb"
-# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32"
-# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
 
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
+# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
+# MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy"
+# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy"
+
+
+MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
 # MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
-# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB"
-# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
-# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 

From 1a1138cbb5629fd47c9d0c056ac21db68af2f77b Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 8 Jan 2025 18:28:45 +0200
Subject: [PATCH 19/55] update

---
 bench/test                             | 27 +++++++++--------
 csrc/multidevice/utils.cpp             |  6 ++--
 tests/cpp/test_multidevice_overlap.cpp | 42 ++++++++++++++++++--------
 3 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/bench/test b/bench/test
index 969b8da00e2..1b5d6f41c5a 100755
--- a/bench/test
+++ b/bench/test
@@ -1,25 +1,19 @@
 #!/bin/bash
-EXPERIMENT=Dummy_profile_UCC_TL_CUDA
+EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
-export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}"
-
-mkdir -p $LOGS
-export LOG_FILE_INFO="${LOGS}/info.txt"
-echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
-
 NP=8
 BACKEND=UCC
 M=131072 #32768
-K=32768
+K=131072
 N=32768 #1024
 
 S=8
 Streams=8
 Pgs=1
 
-L=32768
+L=1048576  #268435456 #67108864 #131072
 PRE_COMM="_pre_comm"
 # POST_COMM="_post_comm"
 # UNFUSE="_unfused"
@@ -41,7 +35,7 @@ MPIFLAGS=" -np $NP"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 
-MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
 # MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb"
 # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32"
 # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32"
@@ -53,10 +47,10 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512"
 
-# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
-# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
+MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
 # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy"
-# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy"
+MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy"
 
 
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
@@ -65,6 +59,13 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
+
+
+export LOGS="${LOG_BASE}/${EXPERIMENT}_${BACKEND}_${DATE}"
+mkdir -p $LOGS
+export LOG_FILE_INFO="${LOGS}/info.txt"
+echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO
+
 echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO
 
 TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}"
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index d2117b222da..5eb4a8a21b9 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -43,11 +43,11 @@ std::unordered_set<IterDomain*> getShardedIterDomains(TensorView* tv) {
 // Returns whether a IterDomain in a TensorView is the outermost
 // allocated IterDomain in the TensorView.
 bool isOutermostAllocatedId(TensorView* tv, IterDomain* id) {
-  for (auto i : tv->getLoopDomain()) {
-    if (i == id) {
+  for (auto* loop_id : tv->getLoopDomain()) {
+    if (loop_id == id) {
       return true;
     }
-    if (!i->isDeviceDim() && !i->isReduction() && !i->isBroadcast()) {
+    if (!loop_id->isDeviceDim() && !loop_id->isReduction() && !loop_id->isBroadcast()) {
       return false;
     }
   }
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 9898df02ac8..a3999b477ba 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -73,10 +73,11 @@ class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamI
 std::map<std::string, float> DummyOverlapBenchmark::times = {};
 
 TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
-  constexpr int64_t number_of_warmups = 50;
-  constexpr int64_t number_of_iterations = 100;
-  constexpr int64_t iteration_profiler_start = 10;
-  constexpr int64_t iteration_profiler_end = 15;
+  constexpr int64_t number_of_warmups = 20;
+  constexpr int64_t number_of_iterations = 80;
+  constexpr int64_t total_number_of_iterations = number_of_warmups + number_of_iterations;
+  constexpr int64_t iteration_profiler_start = 5;
+  constexpr int64_t iteration_profiler_end = 10;
 
 
   auto [backend,
@@ -90,27 +91,36 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   std::vector<RankType> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
   auto world = communicator_->getBackendForTeam(all_ranks, backend);
+  auto nccl_world = communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
 
   std::vector<c10::cuda::CUDAStream> streams =
       createStreams(2, communicator_->deviceId());
   auto& compute_stream = streams.at(0);
   auto& communication_stream = streams.at(1);
 
-  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
-  auto ta = at::randn({M, K}, options);
-  auto tb = at::randn({K, N}, options);
-  auto tc = at::empty({M, N}, options);
-  auto src = at::randn({L}, options);
-  auto dst = at::empty({L * communicator_->size()}, options);
+  auto options_matmul = at::TensorOptions().dtype(torch::kFloat16).device(communicator_->device());
+  auto ta = at::randn({M, K}, options_matmul);
+  auto tb = at::randn({K, N}, options_matmul);
+  auto tc = at::empty({M, N}, options_matmul);
+
+  auto options_comms = at::TensorOptions().dtype(torch::kFloat32).device(communicator_->device());
+  auto src = at::randn({L}, options_comms);
+  auto dst = at::empty({L * communicator_->size()},  options_comms);
+  std::vector<at::Tensor> barrier_scratch_buffer = {at::randn({1}, options_comms)};
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
+  nccl_world->allreduce(barrier_scratch_buffer)->wait();
+
   for (const auto& iteration :
-       c10::irange(number_of_warmups + number_of_iterations)) {
+       c10::irange(total_number_of_iterations)) {
+    if (iteration % 10 == 0 && communicator_->deviceId() == 0) {
+      std::cout << "iteration " << iteration <<"/" << total_number_of_iterations << std::endl;
+    }
     if (iteration == iteration_profiler_start) {
-      cudaProfilerStart();;
+      cudaProfilerStart();
     }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
@@ -133,8 +143,14 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
     if (iteration == iteration_profiler_end) {
       cudaProfilerStop();;
     }
+    if (!pre_comm & !post_comm) {
+      nccl_world->allreduce(barrier_scratch_buffer)->wait();
+    }
     synchronizeStreams(streams);
   }
+  if (pre_comm || post_comm) {
+    nccl_world->allreduce(barrier_scratch_buffer)->wait();
+  }
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);
   float milliseconds = 0;
@@ -154,7 +170,7 @@ INSTANTIATE_TEST_SUITE_P(
     /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
     /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
+    /*L=*/testing::Values(1, pow(2,10), pow(2,15), pow(2,17), pow(2,20), pow(2,24), pow(2,26), pow(2,28)),
     /*pre-comm=*/testing::Bool(),
     /*post-comm=*/testing::Bool()),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)

From e037ee5b62418632055fe5f32f8659b0b4bc49d9 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 16 Jan 2025 02:38:52 -0800
Subject: [PATCH 20/55] test with stream parallel type and host IR

---
 bench/test                             |  34 ++++----
 tests/cpp/test_multidevice_overlap.cpp | 111 ++++++++++++++++++++++++-
 2 files changed, 127 insertions(+), 18 deletions(-)

diff --git a/bench/test b/bench/test
index 1b5d6f41c5a..6777835f7b4 100755
--- a/bench/test
+++ b/bench/test
@@ -1,28 +1,32 @@
 #!/bin/bash
-EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB
+EXPERIMENT=StreamParallelType_tests
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
 NP=8
 BACKEND=UCC
-M=131072 #32768
-K=131072
-N=32768 #1024
+M=32768
+K=32768
+N=1024
 
 S=8
-Streams=8
+Streams=3
 Pgs=1
 
-L=1048576  #268435456 #67108864 #131072
-PRE_COMM="_pre_comm"
+# M=131072 #32768
+# K=131072
+# N=32768 #1024
+# L=1048576  #268435456 #67108864 #131072
+# PRE_COMM="_pre_comm"
 # POST_COMM="_post_comm"
 # UNFUSE="_unfused"
 # GRAPH="_WithCudaGraph"
 # cuStreamWrite=WithcuStreamWriteValue32_
 # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
-GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/"
-# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
-GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}"
+# GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/"
+GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmarkStreamParallelType/"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
+# GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
 
@@ -32,7 +36,7 @@ MPIFLAGS=" -np $NP"
 # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO
 # MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1"
 
-# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
+MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event"
 
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda"
@@ -47,15 +51,15 @@ MPIFLAGS=" -np $NP"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M"
 # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512"
 
-MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
-MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp"
+# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy"
 # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy"
-MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy"
+# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy"
 
 
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
+MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 272d785e2a1..7cf3cd288a4 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -345,15 +345,120 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   }
 }
 
+TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
+  constexpr int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_iterations = 200;
+  constexpr int64_t iteration_profiler_start = 10;
+  constexpr int64_t iteration_profiler_end = 15;
+  // Overview: time executor.runWithInput() on a matmul fusion with a Stream-parallelized output axis, then compare the result against at::matmul.
+  const int64_t D = communicator_->size();
+  auto [backend,
+        S,
+        M,
+        K,
+        N,
+        number_of_streams,
+        add_cuStreamWriteValue32,
+        number_of_pgs,
+        unfuse_loops,
+        use_cuda_graph] = GetParam();
+
+  if (M % (D * S) != 0) {
+    GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M
+                 << ", D = " << D << ", S = " << S;
+  }
+  if (add_cuStreamWriteValue32) {
+    GTEST_SKIP() << "cuStreamWriteValue32 not supported with StreamParallelType";
+  }
+  if (number_of_pgs > 1) {
+    GTEST_SKIP() << "StreamParallelType not supported with multiple process groups";
+  }
+  if (unfuse_loops) {
+    GTEST_SKIP() << "StreamParallelType not supported with unfused loops";
+  }
+  if (use_cuda_graph) {
+    GTEST_SKIP() << "StreamParallelType not supported with cuda graphs";
+  }
+
+  // Define the fusion c = matmul(a, b); `a` and `b` are its inputs and `c` its only output.
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K]
+  TensorView* b = makeContigTensor(2); //[K, N]
+  TensorView* c = matmul(a, b); //[S, D, M/(S*D), N]
+
+  fusion->addInput(a);
+  fusion->addInput(b);
+  fusion->addOutput(c);
+  // All three tensors are assigned the same mesh, obtained from DeviceMesh::createForNumDevices(D).
+  auto mesh = DeviceMesh::createForNumDevices(D);
+  a->setDeviceMesh(mesh);
+  b->setDeviceMesh(mesh);
+  c->setDeviceMesh(mesh);
+  // Axis 1 of `a` is parallelized as DIDx; axis 0 of `c` is parallelized as Stream.
+  a->axis(1)->parallelize(ParallelType::DIDx);
+  c->axis(0)->parallelize(ParallelType::Stream);
+
+  communicator_->setDefaultBackend(backend);
+  // number_of_streams is handed to the executor through HostIrEvaluatorParams.
+  hir::HostIrEvaluatorParams params;
+  params.number_of_streams = number_of_streams;
+  MultiDeviceExecutor executor(std::move(fusion), *communicator_, params);
+
+
+  auto tensor_options =
+      at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+  at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options);
+  at::Tensor ta = ta_unsharded.slice(
+      1, communicator_->deviceId(), communicator_->deviceId() + 1);
+  at::Tensor tb = at::randn({K, N}, tensor_options);
+  at::Tensor tc_ref = at::matmul(ta_unsharded, tb);
+  // The executor input is only the dim-1 slice of ta_unsharded at index communicator_->deviceId(); tc_ref above uses the full tensor.
+  std::vector<c10::IValue> inputs = {ta, tb};
+  at::Tensor tc;
+  // NOTE(review): start/stop are never passed to cudaEventDestroy in this test, and the CUDA return codes are not checked.
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  // Warm-up and timed iterations share one loop; `start` is recorded when iteration == number_of_warmups.
+  for (const auto& iteration :
+       c10::irange(number_of_warmups + number_of_iterations)) {
+    if (iteration == iteration_profiler_start) {
+      cudaProfilerStart();;
+    }
+    if (iteration == number_of_warmups) {
+      cudaEventRecord(start);
+    }
+
+    tc = executor.runWithInput(inputs).at(0);
+    // NOTE(review): cudaProfilerStart()/cudaProfilerStop() in this loop are each followed by a stray second ';'.
+    if (iteration == iteration_profiler_end) {
+      cudaProfilerStop();;
+    }
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  milliseconds /= number_of_iterations;
+  // Store the per-iteration mean under the gtest name; insert() leaves an already-present entry for that key unchanged.
+  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  times.insert({test_name, milliseconds});
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+  // Correctness check on the output of the last iteration, with rtol = atol = 1e-1.
+  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     ,
     OverlapBenchmark,
     testing::Combine(
     testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
     /*S=*/testing::Values(1,2,4,8, 16, 32),
-    /*M=*/testing::Values(pow(2,10), pow(2,15)),
-    /*K=*/testing::Values(pow(2,10), pow(2,15)),
-    /*N=*/testing::Values(pow(2,10)),
+    /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
+    /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
+    /*N=*/testing::Values(pow(2,10), pow(2,15)),
     /*number_of_streams=*/testing::Values(3, 8, 32),
     /*add_cuStreamWriteValue32*/testing::Values(false, true),
     /*number_of_pgs=*/testing::Values(1, 2, 4, 8),

From 8328c2809420bab045248b34c321b74279198a17 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 20 Jan 2025 05:49:51 -0800
Subject: [PATCH 21/55] add support for other dtypes

---
 bench/test                             |  4 +++-
 tests/cpp/test_multidevice_overlap.cpp | 21 +++++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/bench/test b/bench/test
index 6777835f7b4..d1836ac8ccb 100755
--- a/bench/test
+++ b/bench/test
@@ -9,6 +9,8 @@ M=32768
 K=32768
 N=1024
 
+DTYPE="__half" # float, __bfloat
+
 S=8
 Streams=3
 Pgs=1
@@ -25,7 +27,7 @@ Pgs=1
 # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/"
 # GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/"
 GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmarkStreamParallelType/"
-GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
+GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${DTYPE}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}"
 # GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}"
 export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}"
 echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 7cf3cd288a4..c08eea14b93 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -196,7 +196,8 @@ using OverlapBenchmarkParams = std::tuple<
     /*add_cuStreamWriteValue32=*/bool,
     /*number_of_pgs=*/int64_t,
     /*unfuse_loops=*/bool,
-    /*use_cuda_graph=*/bool>;
+    /*use_cuda_graph=*/bool,
+    DataType>;
 
 class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
@@ -233,7 +234,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
         add_cuStreamWriteValue32,
         number_of_pgs,
         unfuse_loops,
-        use_cuda_graph] = GetParam();
+        use_cuda_graph,
+        dtype] = GetParam();
 
   GTEST_ASSERT_EQ(M % S, 0);
 
@@ -244,7 +246,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
       createStreams(number_of_streams, communicator_->deviceId());
   setCurrentCUDAStream(streams.at(0));
 
-  auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+  auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device());
   auto ta = at::randn({S, M/S,K}, options);
   auto ta_unsharded = at::empty({S, D, M/S,K}, options);
   auto tb = at::randn({K,N}, options);
@@ -361,7 +363,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
         add_cuStreamWriteValue32,
         number_of_pgs,
         unfuse_loops,
-        use_cuda_graph] = GetParam();
+        use_cuda_graph,
+        dtype] = GetParam();
 
   if (M % (D * S) != 0) {
     GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M
@@ -384,8 +387,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
-  TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K]
-  TensorView* b = makeContigTensor(2); //[K, N]
+  TensorView* a = makeContigTensor(4, dtype); //[S, DIDx(D), M/(S*D), K]
+  TensorView* b = makeContigTensor(2, dtype); //[K, N]
   TensorView* c = matmul(a, b); //[S, D, M/(S*D), N]
 
   fusion->addInput(a);
@@ -408,7 +411,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
 
 
   auto tensor_options =
-      at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
+      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device());
   at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options);
   at::Tensor ta = ta_unsharded.slice(
       1, communicator_->deviceId(), communicator_->deviceId() + 1);
@@ -463,7 +466,8 @@ INSTANTIATE_TEST_SUITE_P(
     /*add_cuStreamWriteValue32*/testing::Values(false, true),
     /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
     /*unfuse_loops=*/testing::Values(false, true),
-    /*use_cuda_graph=*/testing::Values(false)), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws
+    /*use_cuda_graph=*/testing::Values(false), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws
+    testing::Values(DataType::Float, DataType::Half, DataType::BFloat16)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
@@ -473,6 +477,7 @@ INSTANTIATE_TEST_SUITE_P(
          << "K" << std::get<3>(info.param) << "_"
          << "N" << std::get<4>(info.param) << "_"
          << "Streams" << std::get<5>(info.param) << "_"
+         << /*dtype:*/std::get<10>(info.param) << "_"
          << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
          << "Pgs" << std::get<7>(info.param)
          << ((std::get<8>(info.param))? "_unfused" : "")

From 2fecf02c58822a1fc2da9fdcfcc50ff4ab8204ad Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 22 Jan 2025 17:05:06 +0200
Subject: [PATCH 22/55] remove trace print

---
 bench/test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/test b/bench/test
index d1836ac8ccb..8527e2d370c 100755
--- a/bench/test
+++ b/bench/test
@@ -61,7 +61,7 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl"
 
 MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1"
 # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5"
-MPIFLAGS+=" -x UCC_COLL_TRACE=info"
+# MPIFLAGS+=" -x UCC_COLL_TRACE=info"
 # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug"
 # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1"
 # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2"

From 26f1f7a7c9e4dbeef4006110e5601ad6b3966219 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 23 Jan 2025 11:21:29 -0800
Subject: [PATCH 23/55] add stub files

---
 CMakeLists.txt                           | 13 ++++++++++++-
 tests/cpp/multidevice.h                  |  2 ++
 tests/cpp/multidevice_kernels.cu         | 23 +++++++++++++++++++++++
 tests/cpp/multidevice_kernels.h          | 16 ++++++++++++++++
 tests/cpp/test_multidevice_gpu_comms.cpp | 23 +++++++++++++++++++++++
 5 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 tests/cpp/multidevice_kernels.cu
 create mode 100644 tests/cpp/multidevice_kernels.h
 create mode 100644 tests/cpp/test_multidevice_gpu_comms.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e5322527cf..c1899c416e7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -611,6 +611,16 @@ if(BUILD_TEST)
   target_include_directories(${RNG_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}")
 endif()
 
+if(BUILD_TEST)
+  set(MULTIDEVICE_TEST_KERNELS "${NVFUSER_TESTS}_multidevice_kernels")
+  add_library(${MULTIDEVICE_TEST_KERNELS} SHARED ${NVFUSER_ROOT}/tests/cpp/multidevice_kernels.cu)
+
+  # CUDA 11 does not support C++20, so hard code C++17 here
+  set_property(TARGET ${MULTIDEVICE_TEST_KERNELS} PROPERTY CXX_STANDARD 17)
+  target_link_libraries(${MULTIDEVICE_TEST_KERNELS} PRIVATE torch ${TORCH_LIBRARIES} codegen_internal)
+  target_include_directories(${MULTIDEVICE_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}")
+endif()
+
 function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
   list(APPEND TEST_SRC
     ${NVFUSER_ROOT}/tests/cpp/utils.cpp
@@ -669,8 +679,9 @@ if(BUILD_TEST)
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_pipeline.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_sharding.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_transformer.cpp
+    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_gpu_comms.cpp
   )
-  add_test_without_main(test_multidevice "${MULTIDEVICE_TEST_SRCS}" "")
+  add_test_without_main(test_multidevice "${MULTIDEVICE_TEST_SRCS}" ${MULTIDEVICE_TEST_KERNELS})
   list(APPEND TEST_BINARIES test_multidevice)
 
   set(MULTIDEVICE_TUTORIAL_SRCS)
diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h
index 1831eb46bbb..9863c4e919e 100644
--- a/tests/cpp/multidevice.h
+++ b/tests/cpp/multidevice.h
@@ -48,4 +48,6 @@ class MultiDeviceTest : public NVFuserTest {
   void waitForDebuggerAtRank(DeviceIdxType rank);
 };
 
+__global__ void DummyMultiDeviceKernel();
+
 } // namespace nvfuser
diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu
new file mode 100644
index 00000000000..6553bee9393
--- /dev/null
+++ b/tests/cpp/multidevice_kernels.cu
@@ -0,0 +1,23 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+// Warning: this file should not include any header from nvFuser or pytorch
+// (except raw headers). Compiling dynamic_type.h with nvcc is not supported.
+// Compiling pytorch with nvcc is not supported either.
+
+#include <tests/cpp/multidevice_kernels.h>
+
+namespace nvfuser {
+
+__global__ void DummyMultiDeviceKernel() {}
+
+void LaunchDummyMultiDeviceKernel() {
+  DummyMultiDeviceKernel<<<1, 1>>>();
+}
+
+} // namespace nvfuser
diff --git a/tests/cpp/multidevice_kernels.h b/tests/cpp/multidevice_kernels.h
new file mode 100644
index 00000000000..0f1099aa8c3
--- /dev/null
+++ b/tests/cpp/multidevice_kernels.h
@@ -0,0 +1,16 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+#pragma once
+
+
+namespace nvfuser {
+
+void LaunchDummyMultiDeviceKernel();
+
+} // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
new file mode 100644
index 00000000000..bfffdba70f0
--- /dev/null
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -0,0 +1,23 @@
+// clang-format off
+/*
+* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+* All rights reserved.
+* SPDX-License-Identifier: BSD-3-Clause
+*/
+// clang-format on
+#include <cuda_profiler_api.h>
+#include <fusion.h>
+#include <host_ir/container.h>
+#include <host_ir/executor.h>
+#include <ir/all_nodes.h>
+#include <ops/all_ops.h>
+#include <tests/cpp/multidevice.h>
+#include <tests/cpp/multidevice_kernels.h>
+
+namespace nvfuser {
+
+TEST_F(MultiDeviceTest, DummyMultiDeviceKernelTest) {
+  LaunchDummyMultiDeviceKernel();
+}
+
+} // namespace nvfuser

From 03b0147012452f393582a4cfbd30c6f10644519f Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 24 Jan 2025 06:08:05 -0800
Subject: [PATCH 24/55] first working example opening cuda ipc handles

---
 csrc/multidevice/communicator.h          |  4 ++
 tests/cpp/test_multidevice_gpu_comms.cpp | 54 +++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index f476de2b37a..9ee6c613da8 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -141,6 +141,10 @@ class Communicator {
     return false;
   }
 
+  auto getTcpStore() {
+    return store_;
+  }
+
  private:
   Communicator(
       CommunicatorBackend backend = comm_backend_default,
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index bfffdba70f0..8f662130257 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -16,8 +16,58 @@
 
 namespace nvfuser {
 
-TEST_F(MultiDeviceTest, DummyMultiDeviceKernelTest) {
-  LaunchDummyMultiDeviceKernel();
+namespace {
+
+#define CUDA_CALL(call) ASSERT_EQ((call), cudaSuccess)
+
+template <typename T>
+std::vector<uint8_t> toBytes(T data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<uint8_t*>(&data),
+      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+T fromBytes(std::vector<uint8_t> bytes) {
+  return *reinterpret_cast<T*>(bytes.data());
+}
+
+} // namespace
+
+class GpuCommTest : public MultiDeviceTest {};
+
+TEST_F(GpuCommTest, IpcMemHandle) {
+  // Allocate GPU memory
+  constexpr size_t size = sizeof(int64_t);
+  const int64_t num_devices = communicator_->size();
+  const int64_t rank = communicator_->deviceId();
+  void* d_ptr;
+  CUDA_CALL(cudaMalloc(&d_ptr, size));
+
+  // Write the value 3 to the cuda buffer
+  const int64_t value = rank;
+  CUDA_CALL(cudaMemcpy(d_ptr, &value, sizeof(int64_t), cudaMemcpyHostToDevice));
+
+  cudaIpcMemHandle_t ipc_handle;
+  CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr));
+
+  auto store = communicator_->getTcpStore();
+  store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle));
+  communicator_->barrier();
+  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get("ipc_handle_" + std::to_string((rank + 1) % num_devices)));
+
+  void* peer_d_ptr;
+  CUDA_CALL(cudaIpcOpenMemHandle(&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+
+  int64_t peer_value;
+  CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size, cudaMemcpyDeviceToHost));
+
+  EXPECT_EQ((value + 1) % num_devices, peer_value);
+
+  // Clean up
+  CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
+  CUDA_CALL(cudaFree(d_ptr));
+
 }
 
 } // namespace nvfuser

From 7625fab82011eba82f19d3b38decb2f7ec6a6f59 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 24 Jan 2025 07:21:14 -0800
Subject: [PATCH 25/55] adding a non-working example with
 cudaDeviceCanAccessPeer

---
 tests/cpp/test_multidevice_gpu_comms.cpp | 47 +++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 8f662130257..13c05f228fd 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -44,9 +44,8 @@ TEST_F(GpuCommTest, IpcMemHandle) {
   void* d_ptr;
   CUDA_CALL(cudaMalloc(&d_ptr, size));
 
-  // Write the value 3 to the cuda buffer
   const int64_t value = rank;
-  CUDA_CALL(cudaMemcpy(d_ptr, &value, sizeof(int64_t), cudaMemcpyHostToDevice));
+  CUDA_CALL(cudaMemcpy(d_ptr, &value, size, cudaMemcpyHostToDevice));
 
   cudaIpcMemHandle_t ipc_handle;
   CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr));
@@ -70,4 +69,48 @@ TEST_F(GpuCommTest, IpcMemHandle) {
 
 }
 
+TEST_F(GpuCommTest, DeviceEnablePeerAccess) {
+  // Doesn't seem to work when the PID are differents, i.e., when it's one CPU rank per GPU. The line "udaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice)" throws.
+  // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L324-328
+  // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L249
+  GTEST_SKIP();
+
+  // Allocate GPU memory
+  constexpr size_t size = sizeof(int64_t);
+  const int64_t num_devices = communicator_->size();
+  const int64_t rank = communicator_->deviceId();
+  const int64_t peer = (rank + 1) % num_devices;
+  // const int64_t accessing_peer = (num_devices + rank - 1) % num_devices;
+
+  int can_access_peer;
+  CUDA_CALL(cudaDeviceCanAccessPeer (&can_access_peer, rank, peer));
+  if (!can_access_peer) {
+    GTEST_SKIP() << "Peer access not enabled between devices " << rank << " and " << peer;
+  }
+
+  CUDA_CALL(cudaDeviceEnablePeerAccess(peer, /*flag (reserved)*/0));
+
+  void* d_ptr;
+  CUDA_CALL(cudaMalloc(&d_ptr, size));
+
+  const int64_t value = rank;
+  CUDA_CALL(cudaMemcpy(d_ptr, &value, size, cudaMemcpyHostToDevice));
+
+
+  auto store = communicator_->getTcpStore();
+  store->set("d_ptr_" + std::to_string(rank), toBytes(d_ptr));
+  communicator_->barrier();
+  auto peer_d_ptr = fromBytes<void*>(store->get("d_ptr_" + std::to_string(peer)));
+
+  CUDA_CALL(cudaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice));
+  int64_t peer_value;
+  CUDA_CALL(cudaMemcpy(&peer_value, d_ptr, size, cudaMemcpyDeviceToHost));
+
+  EXPECT_EQ((value + 1) % num_devices, peer_value);
+
+  // Clean up
+  CUDA_CALL(cudaDeviceDisablePeerAccess(peer)); // not necessary
+  CUDA_CALL(cudaFree(d_ptr));
+}
+
 } // namespace nvfuser

From f703abda1196179a2140eab84fbc06a0b60fe6f3 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Tue, 28 Jan 2025 09:36:16 -0800
Subject: [PATCH 26/55] cleanup

---
 tests/cpp/test_multidevice_gpu_comms.cpp | 44 ------------------------
 1 file changed, 44 deletions(-)

diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 13c05f228fd..a9973d656f4 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -69,48 +69,4 @@ TEST_F(GpuCommTest, IpcMemHandle) {
 
 }
 
-TEST_F(GpuCommTest, DeviceEnablePeerAccess) {
-  // Doesn't seem to work when the PID are differents, i.e., when it's one CPU rank per GPU. The line "udaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice)" throws.
-  // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L324-328
-  // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L249
-  GTEST_SKIP();
-
-  // Allocate GPU memory
-  constexpr size_t size = sizeof(int64_t);
-  const int64_t num_devices = communicator_->size();
-  const int64_t rank = communicator_->deviceId();
-  const int64_t peer = (rank + 1) % num_devices;
-  // const int64_t accessing_peer = (num_devices + rank - 1) % num_devices;
-
-  int can_access_peer;
-  CUDA_CALL(cudaDeviceCanAccessPeer (&can_access_peer, rank, peer));
-  if (!can_access_peer) {
-    GTEST_SKIP() << "Peer access not enabled between devices " << rank << " and " << peer;
-  }
-
-  CUDA_CALL(cudaDeviceEnablePeerAccess(peer, /*flag (reserved)*/0));
-
-  void* d_ptr;
-  CUDA_CALL(cudaMalloc(&d_ptr, size));
-
-  const int64_t value = rank;
-  CUDA_CALL(cudaMemcpy(d_ptr, &value, size, cudaMemcpyHostToDevice));
-
-
-  auto store = communicator_->getTcpStore();
-  store->set("d_ptr_" + std::to_string(rank), toBytes(d_ptr));
-  communicator_->barrier();
-  auto peer_d_ptr = fromBytes<void*>(store->get("d_ptr_" + std::to_string(peer)));
-
-  CUDA_CALL(cudaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice));
-  int64_t peer_value;
-  CUDA_CALL(cudaMemcpy(&peer_value, d_ptr, size, cudaMemcpyDeviceToHost));
-
-  EXPECT_EQ((value + 1) % num_devices, peer_value);
-
-  // Clean up
-  CUDA_CALL(cudaDeviceDisablePeerAccess(peer)); // not necessary
-  CUDA_CALL(cudaFree(d_ptr));
-}
-
 } // namespace nvfuser

From abf5c17d27f883173f81c8480ac1f4b167f87f17 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Tue, 28 Jan 2025 11:20:23 -0800
Subject: [PATCH 27/55] AllgatherThroughCudaMemcpyAsync

---
 tests/cpp/multidevice_kernels.cu         | 39 ++++++++++++++++++++++++
 tests/cpp/multidevice_kernels.h          | 34 ++++++++++++++++++++-
 tests/cpp/test_multidevice_gpu_comms.cpp | 37 ++++++++++++----------
 3 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu
index 6553bee9393..7cbd4753eb3 100644
--- a/tests/cpp/multidevice_kernels.cu
+++ b/tests/cpp/multidevice_kernels.cu
@@ -11,13 +11,52 @@
 // Compiling pytorch with nvcc is not supported either.
 
 #include <tests/cpp/multidevice_kernels.h>
+#include <cuda.h>
 
 namespace nvfuser {
 
+#define CUDA_CALL(call) NVF_ERROR((call) == cudaSuccess, "CUDA call failed: ", cudaGetErrorString(cudaGetLastError()))
+
 __global__ void DummyMultiDeviceKernel() {}
 
 void LaunchDummyMultiDeviceKernel() {
   DummyMultiDeviceKernel<<<1, 1>>>();
 }
 
+int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0;
+
+AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
+  cudaIpcMemHandle_t input_ipc_handle;
+  CUDA_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr()));
+
+  auto store = communicator->getTcpStore();
+  const int64_t my_rank = communicator->deviceId();
+  store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle));
+
+  communicator_->barrier();
+
+  sizes_.resize(communicator_->size(), 0);
+  input_ptrs_.resize(communicator_->size(), nullptr);
+  output_ptrs_.resize(communicator_->size(), nullptr);
+  for (int64_t rank: c10::irange(communicator_->size())) {
+    auto output = outputs.at(rank);
+    sizes_.at(rank) = output.numel() * output.element_size();
+
+    output_ptrs_.at(rank) = output.data_ptr();
+    if (rank == my_rank) {
+      input_ptrs_.at(rank) = input.data_ptr();
+    } else {
+      auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + std::to_string(rank)));
+      CUDA_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+    }
+  }
+}
+
+void AllgatherThroughCudaMemcpyAsync::post() const {
+  for (size_t i = 0; i < sizes_.size(); i++) {
+    CUDA_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
+  }
+}
+
+
 } // namespace nvfuser
diff --git a/tests/cpp/multidevice_kernels.h b/tests/cpp/multidevice_kernels.h
index 0f1099aa8c3..40e29bd7989 100644
--- a/tests/cpp/multidevice_kernels.h
+++ b/tests/cpp/multidevice_kernels.h
@@ -7,10 +7,42 @@
 // clang-format on
 
 #pragma once
-
+#include <multidevice/communicator.h>
 
 namespace nvfuser {
 
+template <typename T>
+std::vector<uint8_t> toBytes(T data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<uint8_t*>(&data),
+      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+T fromBytes(std::vector<uint8_t> bytes) {
+  return *reinterpret_cast<T*>(bytes.data());
+}
+
 void LaunchDummyMultiDeviceKernel();
 
+class AllgatherThroughCudaMemcpyAsync {
+ public:
+  AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator);
+
+  void post() const;
+
+ private:
+  std::string prefix() const {
+    return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id);
+  }
+
+  static int64_t running_counter;
+  int64_t unique_id;
+  Communicator* communicator_;
+  std::vector<int64_t> sizes_;
+  std::vector<void*> input_ptrs_;
+  std::vector<void*> output_ptrs_;
+};
+
+
 } // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index a9973d656f4..2afe784056d 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -16,24 +16,8 @@
 
 namespace nvfuser {
 
-namespace {
-
 #define CUDA_CALL(call) ASSERT_EQ((call), cudaSuccess)
 
-template <typename T>
-std::vector<uint8_t> toBytes(T data) {
-  return std::vector<uint8_t>(
-      reinterpret_cast<uint8_t*>(&data),
-      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
-}
-
-template <typename T>
-T fromBytes(std::vector<uint8_t> bytes) {
-  return *reinterpret_cast<T*>(bytes.data());
-}
-
-} // namespace
-
 class GpuCommTest : public MultiDeviceTest {};
 
 TEST_F(GpuCommTest, IpcMemHandle) {
@@ -69,4 +53,25 @@ TEST_F(GpuCommTest, IpcMemHandle) {
 
 }
 
+TEST_F(GpuCommTest, Allgather) {
+  constexpr int64_t kTensorSize = 1024;
+
+  at::Tensor input = at::full({kTensorSize}, communicator_->deviceId(), tensor_options);
+  auto outputs = std::vector<at::Tensor>(communicator_->size());
+  std::generate(outputs.begin(), outputs.end(), [&]() {
+    return at::empty({kTensorSize}, tensor_options);
+  });
+
+  AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_);
+  allgather.post();
+
+  torch::cuda::synchronize();
+  communicator_->barrier();
+
+  for (int64_t i = 0; i < communicator_->size(); ++i) {
+    at::Tensor expected = at::full({kTensorSize}, i, tensor_options);
+    EXPECT_TRUE(outputs[i].equal(expected));
+  }
+}
+
 } // namespace nvfuser

From 836d59955592615c0ee46b0454df9e3c620e43cb Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 29 Jan 2025 15:44:03 +0200
Subject: [PATCH 28/55] refactor to expose choice of backend

---
 csrc/host_ir/executor.cpp                | 81 ++++++++++++++++++++----
 csrc/host_ir/executor.h                  | 33 ++++++++++
 csrc/host_ir/lower.cpp                   |  4 ++
 csrc/host_ir/lower.h                     | 12 +++-
 csrc/multidevice/communication.cpp       |  4 +-
 csrc/multidevice/communication.h         |  7 +-
 csrc/multidevice/communicator.cpp        |  3 +
 csrc/multidevice/communicator.h          |  3 -
 csrc/multidevice/executor.cpp            |  7 +-
 csrc/multidevice/executor.h              |  8 ++-
 csrc/multidevice/multidevice.h           |  3 +
 tests/cpp/multidevice_kernels.cu         | 36 -----------
 tests/cpp/multidevice_kernels.h          | 32 ----------
 tests/cpp/test_multidevice_gpu_comms.cpp |  2 +-
 tests/cpp/test_multidevice_pipeline.cpp  |  4 +-
 15 files changed, 147 insertions(+), 92 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index eba71fd6ee9..8806a8a5b46 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -69,7 +69,8 @@ void HostIrExecutor::compile(Fusion* fusion) {
   } else {
     std::vector<Expr*> exprs = fusion->exprs();
     for (Expr* e : exprs) {
-      std::vector<Expr*> communications = HostIrLower::lower(cloner.clone(e));
+      HostIrLower lower;
+      std::vector<Expr*> communications = lower.lower(cloner.clone(e));
       for (auto* communication : communications) {
         host_ir_container_->pushBackTopLevelExprs(communication);
       }
@@ -408,6 +409,45 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
   }
 }
 
+int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0;
+
+AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
+  cudaIpcMemHandle_t input_ipc_handle;
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr()));
+
+  auto store = communicator->getTcpStore();
+  const int64_t my_rank = communicator->deviceId();
+  store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle));
+
+  communicator_->barrier();
+
+  sizes_.resize(communicator_->size(), 0);
+  input_ptrs_.resize(communicator_->size(), nullptr);
+  output_ptrs_.resize(communicator_->size(), nullptr);
+  for (int64_t rank: c10::irange(communicator_->size())) {
+    auto output = outputs.at(rank);
+    sizes_.at(rank) = output.numel() * output.element_size();
+
+    output_ptrs_.at(rank) = output.data_ptr();
+    if (rank == my_rank) {
+      input_ptrs_.at(rank) = input.data_ptr();
+    } else {
+      auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + std::to_string(rank)));
+      NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+    }
+  }
+}
+
+void AllgatherThroughCudaMemcpyAsync::post() const {
+  for (size_t i = 0; i < sizes_.size(); i++) {
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
+  }
+}
+
+
+
+
+
 void HostIrEvaluator::handle(Communication* communication) {
   NVF_ERROR(
       communicator_ != nullptr && communicator_->is_available(),
@@ -418,14 +458,30 @@ void HostIrEvaluator::handle(Communication* communication) {
   at::Tensor output_tensor =
       getKnownTensorOrUndefined(communication->output(0), expr_evaluator_);
 
-  c10d::Backend* backend =
-      communicator_->getBackendForTeam(communication->team(), std::nullopt);
-  works_[communication] = postSingleCommunication(
-      communication,
-      communicator_->deviceId(),
-      backend,
-      input_tensor,
-      output_tensor);
+  CommunicatorBackend backend_type = communication->backend();
+
+  if (backend_type != CommunicatorBackend::kCuda) {
+    c10d::Backend* backend =
+        communicator_->getBackendForTeam(communication->team(), backend_type);
+    works_[communication] = postSingleCommunication(
+        communication,
+        communicator_->deviceId(),
+        backend,
+        input_tensor,
+        output_tensor);
+    return;
+  }
+
+  NVF_ERROR(communication->type() == CommunicationType::Allgather);
+  if (allgather_backends_.find(communication) == allgather_backends_.end()) {
+    allgather_backends_.try_emplace(
+        communication,
+        AllgatherThroughCudaMemcpyAsync(
+            input_tensor,
+            getKnownTensorOrUndefined(communication->outputs(), expr_evaluator_),
+            communicator_));
+  }
+  allgather_backends_.at(communication).post();
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
@@ -446,8 +502,11 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
 
 void HostIrEvaluator::handle(Wait* wait) {
   Expr* communication = wait->communication();
-  NVF_ERROR(works_.find(communication) != works_.end(), "no wait req");
-  auto& work = works_.at(communication);
+  auto it = works_.find(communication);
+  if (it == works_.end()) {
+    return;
+  }
+  auto& work = it->second;
   if (work != nullptr) {
     work->wait();
   }
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index ad3e8422ca1..435a568bc63 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -48,8 +48,40 @@ class HostIrExecutor : public ExecutorAbstract {
   Communicator* communicator_;
 };
 
+template <typename T>
+std::vector<uint8_t> toBytes(T data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<uint8_t*>(&data),
+      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+T fromBytes(std::vector<uint8_t> bytes) {
+  return *reinterpret_cast<T*>(bytes.data());
+}
+
 namespace hir {
 
+
+class AllgatherThroughCudaMemcpyAsync {
+ public:
+  AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator);
+
+  void post() const;
+
+ private:
+  std::string prefix() const {
+    return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id);
+  }
+
+  static int64_t running_counter;
+  int64_t unique_id;
+  Communicator* communicator_;
+  std::vector<int64_t> sizes_;
+  std::vector<void*> input_ptrs_;
+  std::vector<void*> output_ptrs_;
+};
+
 /*
 a HostIrEvaluator evaluates a host programs represented through a
 HostIrContainer It is instantiated with the desired HostIrContainer, and runs
@@ -145,6 +177,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   std::unordered_map<StreamKey, c10::cuda::CUDAStream> streams_;
   std::unordered_map<Expr*, c10::intrusive_ptr<c10d::Work>> works_;
   const int64_t my_device_index_;
+  std::unordered_map<Expr*, AllgatherThroughCudaMemcpyAsync> allgather_backends_;
 };
 
 } // namespace hir
diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp
index ea52ba5eeb6..1631e773ea0 100644
--- a/csrc/host_ir/lower.cpp
+++ b/csrc/host_ir/lower.cpp
@@ -304,6 +304,10 @@ std::vector<Expr*> HostIrLower::lower(Expr* c) {
       lowerToBroadcastOrSendRecv(input_tv, output_tv, comms);
     }
   }
+
+  std::for_each(comms.begin(), comms.end(), [this](Expr* comm) {
+    comm->as<Communication>()->backend() = params_.communicator_backend;
+  });
   return comms;
 }
 
diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h
index 02d120cb734..47417e9eba4 100644
--- a/csrc/host_ir/lower.h
+++ b/csrc/host_ir/lower.h
@@ -14,22 +14,30 @@
 
 namespace nvfuser {
 
+struct HostIrLowerParams {
+  CommunicatorBackend communicator_backend = CommunicatorBackend::kNccl;
+};
+
 class HostIrLower {
  public:
+
+  HostIrLower(HostIrLowerParams params = HostIrLowerParams()) : params_(params) {}
+
   // The flag `ignore_inner_resharding` is useful because the preseg passes
   // `InsertReshardingsPass` and `ReorderShardedAxisPass` want different
   // behaviors
   static bool canLower(Expr* expr, bool ignore_inner_resharding = false);
 
   // Lower a sharded Expr into a series of Communication.
-  static std::vector<Expr*> lower(Expr* c);
+  std::vector<Expr*> lower(Expr* c);
 
-  static std::unique_ptr<hir::HostIrContainer> lower(
+  std::unique_ptr<hir::HostIrContainer> lower(
       std::unique_ptr<Fusion> fusion,
       int64_t my_device_index);
 
  private:
   static std::vector<Expr*> lowerToCollectiveBasedPipelinedGemmComm(Expr* expr);
+  HostIrLowerParams params_;
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp
index edcc40e4d5f..c49a5f3a85d 100644
--- a/csrc/multidevice/communication.cpp
+++ b/csrc/multidevice/communication.cpp
@@ -145,7 +145,8 @@ Communication::Communication(
     Team team,
     DeviceIdxType root,
     RedOpType red_op,
-    int64_t scattered_axis)
+    int64_t scattered_axis,
+    CommunicatorBackend backend)
     : Expr(passkey) {
   NVF_ERROR(
       in->getDeviceMesh().size() > 0,
@@ -161,6 +162,7 @@ Communication::Communication(
   addDataAttribute(root);
   addDataAttribute(red_op);
   addDataAttribute(scattered_axis);
+  addDataAttribute(backend);
 
   validate();
 }
diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h
index 8631a1a04e5..2714ae4dcea 100644
--- a/csrc/multidevice/communication.h
+++ b/csrc/multidevice/communication.h
@@ -59,7 +59,8 @@ class Communication : public Expr {
                  // sharding.
       DeviceIdxType root = -1,
       RedOpType red_op = RedOpType::UNUSED,
-      int64_t scattered_axis = -1);
+      int64_t scattered_axis = -1,
+      CommunicatorBackend backend = CommunicatorBackend::kNccl);
 
   Communication(const Communication& other) = delete;
   Communication& operator=(const Communication& other) = delete;
@@ -107,6 +108,10 @@ class Communication : public Expr {
     return attribute<int64_t>(4);
   }
 
+  CommunicatorBackend& backend() const {
+    return attribute<CommunicatorBackend>(5);
+  }
+
   // PyTorch's process group expects the root to be specified
   // as an integer between 0 and world_size-1. We choose it to be
   // the device's relative index within the team
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 6cf1a499bb9..ce102695637 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -38,6 +38,9 @@ std::ostream& operator<<(std::ostream& out, const CommunicatorBackend& cb) {
     case CommunicatorBackend::kGloo:
       out << "GLOO";
       break;
+    case CommunicatorBackend::kCuda:
+      out << "CUDA";
+      break;
   }
   return out;
 }
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 9ee6c613da8..3ac48d9906b 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -36,9 +36,6 @@ namespace nvfuser {
 
 using RankType = DeviceIdxType;
 
-// Supported backends. TODO: gloo untested
-enum class CommunicatorBackend { kNccl, kUcc, kGloo };
-
 std::ostream& operator<<(std::ostream& out, const CommunicatorBackend& cb);
 
 #ifdef USE_C10D_NCCL
diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp
index 963b80812d3..eaea12ef2f3 100644
--- a/csrc/multidevice/executor.cpp
+++ b/csrc/multidevice/executor.cpp
@@ -23,13 +23,14 @@ namespace nvfuser {
 MultiDeviceExecutor::MultiDeviceExecutor(
     std::unique_ptr<Fusion> fusion,
     Communicator& comm,
-    hir::HostIrEvaluatorParams params)
+    MultiDeviceExecutorParams params)
     : comm_(comm) {
+  HostIrLower lower(params.lower);
   std::unique_ptr<hir::HostIrContainer> hic =
-      HostIrLower::lower(std::move(fusion), comm.deviceId());
+      lower.lower(std::move(fusion), comm.deviceId());
   // Create the HostIrEvaluator representing the host program
   host_ir_executor_ =
-      std::make_unique<hir::HostIrEvaluator>(std::move(hic), &comm, params);
+      std::make_unique<hir::HostIrEvaluator>(std::move(hic), &comm, params.executor);
 }
 
 std::vector<at::Tensor> MultiDeviceExecutor::runWithInput(
diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h
index 7cad0388b18..e43b7c57f72 100644
--- a/csrc/multidevice/executor.h
+++ b/csrc/multidevice/executor.h
@@ -11,6 +11,7 @@
 #include <exceptions.h>
 #include <fusion.h>
 #include <fusion_segmenter.h>
+#include <host_ir/lower.h>
 #include <host_ir/executor.h>
 #include <ir/cloner.h>
 #include <multidevice/communication.h>
@@ -19,6 +20,11 @@
 
 namespace nvfuser {
 
+struct MultiDeviceExecutorParams {
+  hir::HostIrEvaluatorParams executor = hir::HostIrEvaluatorParams();
+  HostIrLowerParams lower = HostIrLowerParams();
+};
+
 /*
   The MultiDeviceExecutor executes a Fusion on a multi-device setting.
   It is instantiated from a Fusion and a Communicator.
@@ -74,7 +80,7 @@ class MultiDeviceExecutor {
   MultiDeviceExecutor(
       std::unique_ptr<Fusion> fusion,
       Communicator& comm,
-      hir::HostIrEvaluatorParams params = hir::HostIrEvaluatorParams());
+      MultiDeviceExecutorParams params = MultiDeviceExecutorParams());
 
   // Run the fusion on several devices with the given global inputs
   std::vector<at::Tensor> runWithInput(const std::vector<c10::IValue>& inputs);
diff --git a/csrc/multidevice/multidevice.h b/csrc/multidevice/multidevice.h
index 0923383413f..46656f2aceb 100644
--- a/csrc/multidevice/multidevice.h
+++ b/csrc/multidevice/multidevice.h
@@ -15,4 +15,7 @@ using DeviceIdxType = int64_t;
 using DimensionType = int;
 using DeviceType = c10::Device;
 using Team = std::vector<DeviceIdxType>;
+
+// Supported backends. TODO: gloo untested
+enum class CommunicatorBackend { kNccl, kUcc, kGloo, kCuda };
 } // namespace nvfuser
diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu
index 7cbd4753eb3..1d38e034137 100644
--- a/tests/cpp/multidevice_kernels.cu
+++ b/tests/cpp/multidevice_kernels.cu
@@ -23,40 +23,4 @@ void LaunchDummyMultiDeviceKernel() {
   DummyMultiDeviceKernel<<<1, 1>>>();
 }
 
-int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0;
-
-AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
-  cudaIpcMemHandle_t input_ipc_handle;
-  CUDA_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr()));
-
-  auto store = communicator->getTcpStore();
-  const int64_t my_rank = communicator->deviceId();
-  store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle));
-
-  communicator_->barrier();
-
-  sizes_.resize(communicator_->size(), 0);
-  input_ptrs_.resize(communicator_->size(), nullptr);
-  output_ptrs_.resize(communicator_->size(), nullptr);
-  for (int64_t rank: c10::irange(communicator_->size())) {
-    auto output = outputs.at(rank);
-    sizes_.at(rank) = output.numel() * output.element_size();
-
-    output_ptrs_.at(rank) = output.data_ptr();
-    if (rank == my_rank) {
-      input_ptrs_.at(rank) = input.data_ptr();
-    } else {
-      auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + std::to_string(rank)));
-      CUDA_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
-    }
-  }
-}
-
-void AllgatherThroughCudaMemcpyAsync::post() const {
-  for (size_t i = 0; i < sizes_.size(); i++) {
-    CUDA_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
-  }
-}
-
-
 } // namespace nvfuser
diff --git a/tests/cpp/multidevice_kernels.h b/tests/cpp/multidevice_kernels.h
index 40e29bd7989..4cd1e6c16b5 100644
--- a/tests/cpp/multidevice_kernels.h
+++ b/tests/cpp/multidevice_kernels.h
@@ -11,38 +11,6 @@
 
 namespace nvfuser {
 
-template <typename T>
-std::vector<uint8_t> toBytes(T data) {
-  return std::vector<uint8_t>(
-      reinterpret_cast<uint8_t*>(&data),
-      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
-}
-
-template <typename T>
-T fromBytes(std::vector<uint8_t> bytes) {
-  return *reinterpret_cast<T*>(bytes.data());
-}
-
 void LaunchDummyMultiDeviceKernel();
 
-class AllgatherThroughCudaMemcpyAsync {
- public:
-  AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator);
-
-  void post() const;
-
- private:
-  std::string prefix() const {
-    return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id);
-  }
-
-  static int64_t running_counter;
-  int64_t unique_id;
-  Communicator* communicator_;
-  std::vector<int64_t> sizes_;
-  std::vector<void*> input_ptrs_;
-  std::vector<void*> output_ptrs_;
-};
-
-
 } // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 2afe784056d..10d82c99b85 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -62,7 +62,7 @@ TEST_F(GpuCommTest, Allgather) {
     return at::empty({kTensorSize}, tensor_options);
   });
 
-  AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_);
+  hir::AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_);
   allgather.post();
 
   torch::cuda::synchronize();
diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp
index 5a626bfc967..dbd4befd98b 100644
--- a/tests/cpp/test_multidevice_pipeline.cpp
+++ b/tests/cpp/test_multidevice_pipeline.cpp
@@ -124,10 +124,12 @@ void PipelineTest::executeAndValidate(bool validate_with_prescribed_values) {
     std::cout << ss.str() << std::endl;
   }
 
+  MultiDeviceExecutorParams params;
+  params.executor = host_ir_executor_params;
   runtime = std::make_unique<MultiDeviceExecutor>(
       std::make_unique<Fusion>(*fusion),
       *communicator_,
-      host_ir_executor_params);
+      params);
   auto error_msg = runtime->validate();
   if (error_msg != "") {
     GTEST_SKIP() << error_msg;

From e09dd58d02828b382759c81e872d24f3c29addac Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 29 Jan 2025 06:43:47 -0800
Subject: [PATCH 29/55] add backend type to P2PCommunication

---
 csrc/multidevice/communication.cpp      |  4 +++-
 csrc/multidevice/communication.h        | 10 ++++++++--
 tests/cpp/test_multidevice_pipeline.cpp |  1 -
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp
index c49a5f3a85d..07861329567 100644
--- a/csrc/multidevice/communication.cpp
+++ b/csrc/multidevice/communication.cpp
@@ -233,11 +233,13 @@ P2PCommunication::P2PCommunication(
     IrBuilderPasskey passkey,
     P2PCommunicationType type,
     TensorView* buffer,
-    Val* peer)
+    Val* peer,
+    CommunicatorBackend backend)
     : Expr(passkey) {
   addInput(buffer);
   addDataAttribute(type);
   addAttribute(peer);
+  addDataAttribute(backend);
 }
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(P2PCommunication)
diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h
index 2714ae4dcea..944df467a62 100644
--- a/csrc/multidevice/communication.h
+++ b/csrc/multidevice/communication.h
@@ -108,7 +108,7 @@ class Communication : public Expr {
     return attribute<int64_t>(4);
   }
 
-  CommunicatorBackend& backend() const {
+  CommunicatorBackend& backend() {
     return attribute<CommunicatorBackend>(5);
   }
 
@@ -133,7 +133,8 @@ class P2PCommunication : public Expr {
       IrBuilderPasskey passkey,
       P2PCommunicationType type,
       TensorView* buffer,
-      Val* peer);
+      Val* peer,
+      CommunicatorBackend backend = CommunicatorBackend::kNccl);
 
   P2PCommunication(const P2PCommunication& other) = delete;
   P2PCommunication& operator=(const P2PCommunication& other) = delete;
@@ -159,6 +160,11 @@ class P2PCommunication : public Expr {
   Val* peer() const {
     return attributeVal(1);
   }
+
+  CommunicatorBackend& backend() {
+    return attribute<CommunicatorBackend>(2);
+  }
+
 };
 
 // The method "post" triggers the execution of the communication. This call is
diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp
index dbd4befd98b..ca113123e16 100644
--- a/tests/cpp/test_multidevice_pipeline.cpp
+++ b/tests/cpp/test_multidevice_pipeline.cpp
@@ -154,7 +154,6 @@ void PipelineTest::executeAndValidate(bool validate_with_prescribed_values) {
 
 PipelineTest::PipelineTest() {
   fusion = std::make_unique<Fusion>();
-  communicator_->setDefaultBackend(CommunicatorBackend::kNccl);
 }
 
 // To run the following tests on several devices, pytorch must be installed with

From 63717467dc86e70c934eec5d680d28156de5e408 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 30 Jan 2025 10:17:13 +0000
Subject: [PATCH 30/55] wip

---
 csrc/host_ir/executor.cpp                | 54 ++++++++++------
 csrc/host_ir/executor.h                  |  2 +-
 csrc/host_ir/lower.cpp                   |  6 +-
 csrc/host_ir/lower.h                     |  2 +-
 tests/cpp/test_multidevice_gpu_comms.cpp | 36 +++++++++++
 tests/cpp/test_multidevice_overlap.cpp   | 78 +++++++++++++-----------
 6 files changed, 121 insertions(+), 57 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 8806a8a5b46..bc35fa4ec93 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -415,9 +415,12 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
   cudaIpcMemHandle_t input_ipc_handle;
   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr()));
 
+  std::string rank_prefix = "_rank=";
+
   auto store = communicator->getTcpStore();
   const int64_t my_rank = communicator->deviceId();
-  store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle));
+  store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(input_ipc_handle));
+  std::cout << "rank " << communicator_->deviceId() << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank) <<  std::endl;
 
   communicator_->barrier();
 
@@ -432,7 +435,8 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
     if (rank == my_rank) {
       input_ptrs_.at(rank) = input.data_ptr();
     } else {
-      auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + std::to_string(rank)));
+      std::cout << "rank " << communicator_->deviceId() << " gets at key " << prefix() + rank_prefix + std::to_string(rank) << " for iteration " << rank <<  std::endl;
+      auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + rank_prefix + std::to_string(rank)));
       NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
     }
   }
@@ -440,6 +444,7 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
 
 void AllgatherThroughCudaMemcpyAsync::post() const {
   for (size_t i = 0; i < sizes_.size(); i++) {
+    std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i)<<  std::endl;
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
   }
 }
@@ -473,15 +478,23 @@ void HostIrEvaluator::handle(Communication* communication) {
   }
 
   NVF_ERROR(communication->type() == CommunicationType::Allgather);
-  if (allgather_backends_.find(communication) == allgather_backends_.end()) {
-    allgather_backends_.try_emplace(
-        communication,
-        AllgatherThroughCudaMemcpyAsync(
-            input_tensor,
-            getKnownTensorOrUndefined(communication->outputs(), expr_evaluator_),
-            communicator_));
-  }
-  allgather_backends_.at(communication).post();
+  // if (allgather_backends_.find(communication) == allgather_backends_.end()) {
+  //   // TODO: retrieve sharded axis here
+  //   auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
+  //   allgather_backends_.try_emplace(
+  //       communication,
+  //       AllgatherThroughCudaMemcpyAsync(
+  //           input_tensor,
+  //           output_tensors,
+  //           communicator_));
+  // }
+  // allgather_backends_.at(communication).post();
+
+  auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
+  AllgatherThroughCudaMemcpyAsync allgather_backend(input_tensor, output_tensors, communicator_);
+  allgather_backend.post();
+  torch::cuda::synchronize();
+  communicator_->barrier();
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
@@ -492,12 +505,19 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   at::Tensor buffer =
       getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_);
 
-  works_[communication] = postSingleCommunication(
-      communication,
-      communicator_->deviceId(),
-      expr_evaluator_.evaluate(communication->peer()).as<int64_t>(),
-      communicator_->getWorld(),
-      buffer);
+  CommunicatorBackend backend_type = communication->backend();
+
+  if (backend_type != CommunicatorBackend::kCuda) {
+
+    works_[communication] = postSingleCommunication(
+        communication,
+        communicator_->deviceId(),
+        expr_evaluator_.evaluate(communication->peer()).as<int64_t>(),
+        communicator_->getWorld(),
+        buffer);
+    return;
+  }
+  NVF_ERROR(false, "CUDA backend not supported yet");
 }
 
 void HostIrEvaluator::handle(Wait* wait) {
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 435a568bc63..f4cb7608d1d 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -71,7 +71,7 @@ class AllgatherThroughCudaMemcpyAsync {
 
  private:
   std::string prefix() const {
-    return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id);
+    return "AllgatherThroughCudaMemcpyAsync_uniqueId=" + std::to_string(unique_id);
   }
 
   static int64_t running_counter;
diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp
index ff6a99dc421..0bb4a8b885c 100644
--- a/csrc/host_ir/lower.cpp
+++ b/csrc/host_ir/lower.cpp
@@ -475,7 +475,11 @@ std::vector<Expr*> HostIrLower::lowerToCollectiveBasedPipelinedGemmComm(
       CommunicationType::Allgather,
       /*out=*/tva_allgathered_j,
       /*in=*/tva_j,
-      /*team=*/tva->getDeviceMesh().vector());
+      /*team=*/tva->getDeviceMesh().vector(),
+      /*root=*/-1,
+      /*red_op=*/RedOpType::UNUSED,
+      /*scattered_axis=*/-1,
+      params_.communicator_backend);
   auto* wait = IrBuilder::create<hir::Wait>(communication);
 
   Expr* compute = nullptr;
diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h
index 47417e9eba4..88d5dd10fa7 100644
--- a/csrc/host_ir/lower.h
+++ b/csrc/host_ir/lower.h
@@ -36,7 +36,7 @@ class HostIrLower {
       int64_t my_device_index);
 
  private:
-  static std::vector<Expr*> lowerToCollectiveBasedPipelinedGemmComm(Expr* expr);
+  std::vector<Expr*> lowerToCollectiveBasedPipelinedGemmComm(Expr* expr);
   HostIrLowerParams params_;
 };
 
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 10d82c99b85..e017830ea53 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -53,6 +53,42 @@ TEST_F(GpuCommTest, IpcMemHandle) {
 
 }
 
+TEST_F(GpuCommTest, IpcMemHandlePtrArithmetic) {
+  // Allocate GPU memory
+  constexpr size_t size = 2 * sizeof(int64_t);
+  const int64_t num_devices = communicator_->size();
+  const int64_t rank = communicator_->deviceId();
+  const int64_t peer_rank = (rank + 1) % num_devices;
+  void* d_ptr;
+  CUDA_CALL(cudaMalloc(&d_ptr, size));
+
+  std::vector<int64_t> values;
+  values.push_back(2 * rank);
+  values.push_back(2 * rank + 1);
+  CUDA_CALL(cudaMemcpy(d_ptr, values.data(), size, cudaMemcpyHostToDevice));
+
+  cudaIpcMemHandle_t ipc_handle;
+  CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr));
+
+  auto store = communicator_->getTcpStore();
+  store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle));
+  communicator_->barrier();
+  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get("ipc_handle_" + std::to_string(peer_rank)));
+
+  int64_t* peer_d_ptr;
+  CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+
+  int64_t peer_value;
+  CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr + 1, size / 2, cudaMemcpyDeviceToHost));
+
+  EXPECT_EQ(2 * peer_rank + 1, peer_value);
+
+  // Clean up
+  CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
+  CUDA_CALL(cudaFree(d_ptr));
+
+}
+
 TEST_F(GpuCommTest, Allgather) {
   constexpr int64_t kTensorSize = 1024;
 
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index c08eea14b93..100cc3b92a4 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -236,7 +236,9 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
         unfuse_loops,
         use_cuda_graph,
         dtype] = GetParam();
-
+  if (backend == CommunicatorBackend::kCuda) {
+    GTEST_SKIP() << "Cuda Backend not supported in this test";
+  }
   GTEST_ASSERT_EQ(M % S, 0);
 
   std::vector<RankType> all_ranks(communicator_->size());
@@ -348,10 +350,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
 }
 
 TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
-  constexpr int64_t number_of_warmups = 50;
-  constexpr int64_t number_of_iterations = 200;
-  constexpr int64_t iteration_profiler_start = 10;
-  constexpr int64_t iteration_profiler_end = 15;
+  // constexpr int64_t number_of_warmups = 50;
+  // constexpr int64_t number_of_iterations = 200;
+  // constexpr int64_t iteration_profiler_start = 10;
+  // constexpr int64_t iteration_profiler_end = 15;
 
   const int64_t D = communicator_->size();
   auto [backend,
@@ -403,10 +405,9 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   a->axis(1)->parallelize(ParallelType::DIDx);
   c->axis(0)->parallelize(ParallelType::Stream);
 
-  communicator_->setDefaultBackend(backend);
-
-  hir::HostIrEvaluatorParams params;
-  params.number_of_streams = number_of_streams;
+  MultiDeviceExecutorParams params;
+  params.lower.communicator_backend = backend;
+  params.executor.number_of_streams = number_of_streams;
   MultiDeviceExecutor executor(std::move(fusion), *communicator_, params);
 
 
@@ -421,47 +422,50 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   std::vector<c10::IValue> inputs = {ta, tb};
   at::Tensor tc;
 
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
+  // cudaEvent_t start, stop;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
 
-  for (const auto& iteration :
-       c10::irange(number_of_warmups + number_of_iterations)) {
-    if (iteration == iteration_profiler_start) {
-      cudaProfilerStart();;
-    }
-    if (iteration == number_of_warmups) {
-      cudaEventRecord(start);
-    }
+  // for (const auto& iteration :
+  //      c10::irange(1)) {
+    // if (iteration == iteration_profiler_start) {
+    //   cudaProfilerStart();;
+    // }
+    // if (iteration == number_of_warmups) {
+    //   cudaEventRecord(start);
+    // }
 
     tc = executor.runWithInput(inputs).at(0);
 
-    if (iteration == iteration_profiler_end) {
-      cudaProfilerStop();;
-    }
-  }
-  cudaEventRecord(stop);
-  cudaEventSynchronize(stop);
-  float milliseconds = 0;
-  cudaEventElapsedTime(&milliseconds, start, stop);
-  milliseconds /= number_of_iterations;
+    // if (iteration == iteration_profiler_end) {
+    //   cudaProfilerStop();;
+    // }
+  // }
+  // cudaEventRecord(stop);
+  // cudaEventSynchronize(stop);
+  // float milliseconds = 0;
+  // cudaEventElapsedTime(&milliseconds, start, stop);
+  // milliseconds /= number_of_iterations;
 
-  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
-  times.insert({test_name, milliseconds});
-  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+  // std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  // times.insert({test_name, milliseconds});
+  // std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+
+  torch::cuda::synchronize();
+  communicator_->barrier();
 
-  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1));
+  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref << ",\ntc: " << tc;
 }
 
 INSTANTIATE_TEST_SUITE_P(
     ,
     OverlapBenchmark,
     testing::Combine(
-    testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
+    testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc, CommunicatorBackend::kCuda),
     /*S=*/testing::Values(1,2,4,8, 16, 32),
-    /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
-    /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)),
-    /*N=*/testing::Values(pow(2,10), pow(2,15)),
+    /*M=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)),
+    /*K=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)),
+    /*N=*/testing::Values(pow(2,3), pow(2,10), pow(2,15)),
     /*number_of_streams=*/testing::Values(3, 8, 32),
     /*add_cuStreamWriteValue32*/testing::Values(false, true),
     /*number_of_pgs=*/testing::Values(1, 2, 4, 8),

From b700a31b881ebc29c66dda1430eb68ef3db07097 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 30 Jan 2025 03:36:17 -0800
Subject: [PATCH 31/55] working chkpt

---
 csrc/host_ir/executor.cpp                | 37 ++++++++++++++++-----
 tests/cpp/test_multidevice_gpu_comms.cpp | 42 +++++++++++++++++++++++-
 tests/cpp/test_multidevice_overlap.cpp   |  2 ++
 3 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index bc35fa4ec93..4714d14ded2 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -411,16 +411,30 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
 
 int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0;
 
+struct IpcTensorInfo {
+  cudaIpcMemHandle_t ipc_handle;
+  int64_t storage_offset;
+  int64_t element_size;
+};
+
 AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
-  cudaIpcMemHandle_t input_ipc_handle;
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr()));
 
   std::string rank_prefix = "_rank=";
+  std::string ipc_handle_prefix = "_IpcHandle=";
+  std::string offset_prefix = "_Offset=";
+
+  IpcTensorInfo ipc_tensor_info;
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr()));
+  ipc_tensor_info.storage_offset = input.storage_offset();
+  ipc_tensor_info.element_size = input.element_size();
 
-  auto store = communicator->getTcpStore();
   const int64_t my_rank = communicator->deviceId();
-  store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(input_ipc_handle));
-  std::cout << "rank " << communicator_->deviceId() << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank) <<  std::endl;
+  auto store = communicator->getTcpStore();
+  store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
+  std::cout << "rank " << communicator_->deviceId()
+            << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank)
+            << " offset " << input.storage_offset() << " at key " << prefix() + offset_prefix + std::to_string(my_rank)
+             << ", for input=" << input <<  std::endl;
 
   communicator_->barrier();
 
@@ -436,16 +450,20 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
       input_ptrs_.at(rank) = input.data_ptr();
     } else {
       std::cout << "rank " << communicator_->deviceId() << " gets at key " << prefix() + rank_prefix + std::to_string(rank) << " for iteration " << rank <<  std::endl;
-      auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + rank_prefix + std::to_string(rank)));
-      NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+      ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix() + rank_prefix + std::to_string(rank)));
+      // auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + rank_prefix + std::to_string(rank)));
+      void*& ptr = input_ptrs_.at(rank);
+      NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+      ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size);
     }
   }
 }
 
 void AllgatherThroughCudaMemcpyAsync::post() const {
   for (size_t i = 0; i < sizes_.size(); i++) {
-    std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i)<<  std::endl;
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
+    torch::cuda::synchronize();
+    std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i) <<  std::endl;
   }
 }
 
@@ -495,6 +513,9 @@ void HostIrEvaluator::handle(Communication* communication) {
   allgather_backend.post();
   torch::cuda::synchronize();
   communicator_->barrier();
+  if (communicator_->deviceId() == 0) {
+    std::cout << "rank " << communicator_->deviceId() << " finishes allgather, output=" << output_tensor << std::endl;
+  }
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index e017830ea53..d59fe3628a1 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -53,7 +53,9 @@ TEST_F(GpuCommTest, IpcMemHandle) {
 
 }
 
-TEST_F(GpuCommTest, IpcMemHandlePtrArithmetic) {
+TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) {
+  // TLDR; We can do pointer arithmetic on the receiver side.
+
   // Allocate GPU memory
   constexpr size_t size = 2 * sizeof(int64_t);
   const int64_t num_devices = communicator_->size();
@@ -89,6 +91,44 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmetic) {
 
 }
 
+TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
+  // TLDR; We CANNOT do pointer arithmetic on the sender side! The IPC handle points to the beginning of the allocated buffer.
+
+  // Allocate GPU memory
+  constexpr size_t size = 2 * sizeof(int64_t);
+  const int64_t num_devices = communicator_->size();
+  const int64_t rank = communicator_->deviceId();
+  const int64_t peer_rank = (rank + 1) % num_devices;
+  int64_t* d_ptr;
+  CUDA_CALL(cudaMalloc(&d_ptr, size));
+
+  std::vector<int64_t> values;
+  values.push_back(2 * rank);
+  values.push_back(2 * rank + 1);
+  CUDA_CALL(cudaMemcpy(d_ptr, values.data(), size, cudaMemcpyHostToDevice));
+
+  cudaIpcMemHandle_t ipc_handle;
+  CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr + 1));
+
+  auto store = communicator_->getTcpStore();
+  store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle));
+  communicator_->barrier();
+  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get("ipc_handle_" + std::to_string(peer_rank)));
+
+  int64_t* peer_d_ptr;
+  CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+
+  int64_t peer_value;
+  CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size / 2, cudaMemcpyDeviceToHost));
+
+  EXPECT_EQ(2 * peer_rank, peer_value); // and not 2 * peer_rank + 1 as could be expected!
+
+  // Clean up
+  CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
+  CUDA_CALL(cudaFree(d_ptr));
+
+}
+
 TEST_F(GpuCommTest, Allgather) {
   constexpr int64_t kTensorSize = 1024;
 
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 100cc3b92a4..4fa60f67ecc 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -422,6 +422,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   std::vector<c10::IValue> inputs = {ta, tb};
   at::Tensor tc;
 
+  std::cout << "rank " << communicator_->deviceId() << ", ta_unsharded_ptr=" << ta_unsharded.data_ptr() << ", ta_ptr=" << ta.data_ptr()  << std::endl;
+
   // cudaEvent_t start, stop;
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);

From 1838d1e7d91fedaa161c4d6a8bd56810741c98f1 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 30 Jan 2025 14:04:34 +0200
Subject: [PATCH 32/55] remove prints

---
 csrc/host_ir/executor.cpp                | 16 ++--------------
 tests/cpp/test_multidevice_gpu_comms.cpp |  2 --
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 4714d14ded2..de9d3ecbcba 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -420,8 +420,6 @@ struct IpcTensorInfo {
 AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
 
   std::string rank_prefix = "_rank=";
-  std::string ipc_handle_prefix = "_IpcHandle=";
-  std::string offset_prefix = "_Offset=";
 
   IpcTensorInfo ipc_tensor_info;
   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr()));
@@ -431,10 +429,6 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
   const int64_t my_rank = communicator->deviceId();
   auto store = communicator->getTcpStore();
   store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
-  std::cout << "rank " << communicator_->deviceId()
-            << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank)
-            << " offset " << input.storage_offset() << " at key " << prefix() + offset_prefix + std::to_string(my_rank)
-             << ", for input=" << input <<  std::endl;
 
   communicator_->barrier();
 
@@ -449,7 +443,6 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
     if (rank == my_rank) {
       input_ptrs_.at(rank) = input.data_ptr();
     } else {
-      std::cout << "rank " << communicator_->deviceId() << " gets at key " << prefix() + rank_prefix + std::to_string(rank) << " for iteration " << rank <<  std::endl;
       ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix() + rank_prefix + std::to_string(rank)));
       // auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + rank_prefix + std::to_string(rank)));
       void*& ptr = input_ptrs_.at(rank);
@@ -457,20 +450,18 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
       ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size);
     }
   }
+  // TODO: close ipc mem handle at shutdown
 }
 
 void AllgatherThroughCudaMemcpyAsync::post() const {
+  // TODO: use multicast
   for (size_t i = 0; i < sizes_.size(); i++) {
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
     torch::cuda::synchronize();
-    std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i) <<  std::endl;
   }
 }
 
 
-
-
-
 void HostIrEvaluator::handle(Communication* communication) {
   NVF_ERROR(
       communicator_ != nullptr && communicator_->is_available(),
@@ -513,9 +504,6 @@ void HostIrEvaluator::handle(Communication* communication) {
   allgather_backend.post();
   torch::cuda::synchronize();
   communicator_->barrier();
-  if (communicator_->deviceId() == 0) {
-    std::cout << "rank " << communicator_->deviceId() << " finishes allgather, output=" << output_tensor << std::endl;
-  }
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index d59fe3628a1..026df97ea26 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -88,7 +88,6 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) {
   // Clean up
   CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
   CUDA_CALL(cudaFree(d_ptr));
-
 }
 
 TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
@@ -126,7 +125,6 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
   // Clean up
   CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
   CUDA_CALL(cudaFree(d_ptr));
-
 }
 
 TEST_F(GpuCommTest, Allgather) {

From 21eed4a7acf0e9e05a117f312deaf80f1a24230a Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 30 Jan 2025 08:51:02 -0800
Subject: [PATCH 33/55] working chkpt

---
 csrc/host_ir/executor.cpp              | 19 ++++----
 csrc/host_ir/executor.h                |  2 +-
 tests/cpp/test_multidevice_overlap.cpp | 61 ++++++++++++--------------
 3 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index de9d3ecbcba..012f5d054e2 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -190,7 +190,7 @@ HostIrEvaluator::HostIrEvaluator(
     : container_(std::move(container)),
       communicator_(communicator),
       params_(params),
-      my_device_index_(communicator_ ? communicator_->deviceId() : 0) {
+      my_local_device_index_(communicator_ ? communicator_->local_rank() : 0) {
   const DeviceIdxType device_index =
       (communicator_ != nullptr && communicator_->is_available())
       ? communicator_->deviceId()
@@ -280,13 +280,13 @@ void HostIrEvaluator::handle(GetCurrentStream* get_current_stream) {
   streams_.insert(
       {get_current_stream->stream(),
        c10::cuda::getCurrentCUDAStream(
-           static_cast<c10::DeviceIndex>(my_device_index_))});
+           static_cast<c10::DeviceIndex>(my_local_device_index_))});
 }
 
 void HostIrEvaluator::handle(Synchronize* synchronize) {
   cudaStream_t current_stream =
       c10::cuda::getCurrentCUDAStream(
-          static_cast<c10::DeviceIndex>(my_device_index_))
+          static_cast<c10::DeviceIndex>(my_local_device_index_))
           .stream();
   cudaStream_t stream_to_sync = getCUDAStream(synchronize->stream()).stream();
 
@@ -419,7 +419,6 @@ struct IpcTensorInfo {
 
 AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
 
-  std::string rank_prefix = "_rank=";
 
   IpcTensorInfo ipc_tensor_info;
   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr()));
@@ -428,7 +427,7 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
 
   const int64_t my_rank = communicator->deviceId();
   auto store = communicator->getTcpStore();
-  store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
+  store->set(prefix() + std::to_string(my_rank), toBytes(ipc_tensor_info));
 
   communicator_->barrier();
 
@@ -443,8 +442,7 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
     if (rank == my_rank) {
       input_ptrs_.at(rank) = input.data_ptr();
     } else {
-      ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix() + rank_prefix + std::to_string(rank)));
-      // auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get(prefix() + rank_prefix + std::to_string(rank)));
+      ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix() + std::to_string(rank)));
       void*& ptr = input_ptrs_.at(rank);
       NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
       ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size);
@@ -454,10 +452,10 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu
 }
 
 void AllgatherThroughCudaMemcpyAsync::post() const {
+  cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(communicator_->local_rank())).stream();
   // TODO: use multicast
   for (size_t i = 0; i < sizes_.size(); i++) {
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice));
-    torch::cuda::synchronize();
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice, stream));
   }
 }
 
@@ -487,6 +485,7 @@ void HostIrEvaluator::handle(Communication* communication) {
   }
 
   NVF_ERROR(communication->type() == CommunicationType::Allgather);
+  // TODO: fix registration cache
   // if (allgather_backends_.find(communication) == allgather_backends_.end()) {
   //   // TODO: retrieve sharded axis here
   //   auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
@@ -502,8 +501,6 @@ void HostIrEvaluator::handle(Communication* communication) {
   auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
   AllgatherThroughCudaMemcpyAsync allgather_backend(input_tensor, output_tensors, communicator_);
   allgather_backend.post();
-  torch::cuda::synchronize();
-  communicator_->barrier();
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index f4cb7608d1d..44e615eb484 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -176,7 +176,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   using StreamKey = std::variant<int64_t, Stream*>;
   std::unordered_map<StreamKey, c10::cuda::CUDAStream> streams_;
   std::unordered_map<Expr*, c10::intrusive_ptr<c10d::Work>> works_;
-  const int64_t my_device_index_;
+  const int64_t my_local_device_index_;
   std::unordered_map<Expr*, AllgatherThroughCudaMemcpyAsync> allgather_backends_;
 };
 
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 4fa60f67ecc..21c2a838805 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -350,10 +350,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
 }
 
 TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
-  // constexpr int64_t number_of_warmups = 50;
-  // constexpr int64_t number_of_iterations = 200;
-  // constexpr int64_t iteration_profiler_start = 10;
-  // constexpr int64_t iteration_profiler_end = 15;
+  constexpr int64_t number_of_warmups = 50;
+  constexpr int64_t number_of_iterations = 200;
+  constexpr int64_t iteration_profiler_start = 10;
+  constexpr int64_t iteration_profiler_end = 15;
 
   const int64_t D = communicator_->size();
   auto [backend,
@@ -422,39 +422,34 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   std::vector<c10::IValue> inputs = {ta, tb};
   at::Tensor tc;
 
-  std::cout << "rank " << communicator_->deviceId() << ", ta_unsharded_ptr=" << ta_unsharded.data_ptr() << ", ta_ptr=" << ta.data_ptr()  << std::endl;
-
-  // cudaEvent_t start, stop;
-  // cudaEventCreate(&start);
-  // cudaEventCreate(&stop);
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
 
-  // for (const auto& iteration :
-  //      c10::irange(1)) {
-    // if (iteration == iteration_profiler_start) {
-    //   cudaProfilerStart();;
-    // }
-    // if (iteration == number_of_warmups) {
-    //   cudaEventRecord(start);
-    // }
+  for (const auto& iteration :
+       c10::irange(number_of_iterations)) {
+    if (iteration == iteration_profiler_start) {
+      // cudaProfilerStart();;
+    }
+    if (iteration == number_of_warmups) {
+      cudaEventRecord(start);
+    }
 
     tc = executor.runWithInput(inputs).at(0);
 
-    // if (iteration == iteration_profiler_end) {
-    //   cudaProfilerStop();;
-    // }
-  // }
-  // cudaEventRecord(stop);
-  // cudaEventSynchronize(stop);
-  // float milliseconds = 0;
-  // cudaEventElapsedTime(&milliseconds, start, stop);
-  // milliseconds /= number_of_iterations;
-
-  // std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
-  // times.insert({test_name, milliseconds});
-  // std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
-
-  torch::cuda::synchronize();
-  communicator_->barrier();
+    if (iteration == iteration_profiler_end) {
+      // cudaProfilerStop();;
+    }
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float milliseconds = 0;
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  milliseconds /= number_of_iterations;
+
+  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  times.insert({test_name, milliseconds});
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
 
   EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref << ",\ntc: " << tc;
 }

From f455c7093186cd2c8fa430b2ecac76556a41a4ca Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Thu, 30 Jan 2025 09:11:44 -0800
Subject: [PATCH 34/55] reenable profiling

---
 tests/cpp/test_multidevice_overlap.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 21c2a838805..6344fcae890 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -429,7 +429,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   for (const auto& iteration :
        c10::irange(number_of_iterations)) {
     if (iteration == iteration_profiler_start) {
-      // cudaProfilerStart();;
+      cudaProfilerStart();
     }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
@@ -438,7 +438,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
     tc = executor.runWithInput(inputs).at(0);
 
     if (iteration == iteration_profiler_end) {
-      // cudaProfilerStop();;
+      cudaProfilerStop();
     }
   }
   cudaEventRecord(stop);

From 5a27b7e5d4231073a4ea7ca0256bcd0a9a618903 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 31 Jan 2025 06:26:25 -0800
Subject: [PATCH 35/55] fix cache for ipc handles

---
 bench/test                               | 10 +--
 csrc/host_ir/executor.cpp                | 77 +++---------------------
 csrc/host_ir/executor.h                  | 33 ----------
 csrc/multidevice/communicator.cpp        | 41 +++++++++++++
 csrc/multidevice/communicator.h          | 30 +++++++++
 tests/cpp/test_multidevice_gpu_comms.cpp |  3 +-
 6 files changed, 88 insertions(+), 106 deletions(-)

diff --git a/bench/test b/bench/test
index 8527e2d370c..19275e4b2e5 100755
--- a/bench/test
+++ b/bench/test
@@ -1,15 +1,15 @@
 #!/bin/bash
-EXPERIMENT=StreamParallelType_tests
+EXPERIMENT=CUDA_tests
 DATE=$(date +%Y%m%d-%H%M)
 LOG_BASE="/opt/pytorch/Fuser/bench/logs"
 
 NP=8
-BACKEND=UCC
+BACKEND=CUDA
 M=32768
 K=32768
 N=1024
 
-DTYPE="__half" # float, __bfloat
+DTYPE="float" #"__half" # float, __bfloat
 
 S=8
 Streams=3
@@ -80,7 +80,9 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO
 MPICMD="mpirun $MPIFLAGS $TEST_CMD"
 echo $MPICMD | tee -a $LOG_FILE_INFO
 
-NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
+# opt/pytorch/scripts/nsight/install-nsight.sh
+NSYS=$(sudo which nsys)
+NSYSCMD="${NSYS} profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other"
 
 CMD="${NSYSCMD} ${MPICMD}"
 sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 012f5d054e2..7bc35107996 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -409,57 +409,6 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
   }
 }
 
-int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0;
-
-struct IpcTensorInfo {
-  cudaIpcMemHandle_t ipc_handle;
-  int64_t storage_offset;
-  int64_t element_size;
-};
-
-AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) {
-
-
-  IpcTensorInfo ipc_tensor_info;
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr()));
-  ipc_tensor_info.storage_offset = input.storage_offset();
-  ipc_tensor_info.element_size = input.element_size();
-
-  const int64_t my_rank = communicator->deviceId();
-  auto store = communicator->getTcpStore();
-  store->set(prefix() + std::to_string(my_rank), toBytes(ipc_tensor_info));
-
-  communicator_->barrier();
-
-  sizes_.resize(communicator_->size(), 0);
-  input_ptrs_.resize(communicator_->size(), nullptr);
-  output_ptrs_.resize(communicator_->size(), nullptr);
-  for (int64_t rank: c10::irange(communicator_->size())) {
-    auto output = outputs.at(rank);
-    sizes_.at(rank) = output.numel() * output.element_size();
-
-    output_ptrs_.at(rank) = output.data_ptr();
-    if (rank == my_rank) {
-      input_ptrs_.at(rank) = input.data_ptr();
-    } else {
-      ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix() + std::to_string(rank)));
-      void*& ptr = input_ptrs_.at(rank);
-      NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
-      ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size);
-    }
-  }
-  // TODO: close ipc mem handle at shutdown
-}
-
-void AllgatherThroughCudaMemcpyAsync::post() const {
-  cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(communicator_->local_rank())).stream();
-  // TODO: use multicast
-  for (size_t i = 0; i < sizes_.size(); i++) {
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice, stream));
-  }
-}
-
-
 void HostIrEvaluator::handle(Communication* communication) {
   NVF_ERROR(
       communicator_ != nullptr && communicator_->is_available(),
@@ -485,22 +434,16 @@ void HostIrEvaluator::handle(Communication* communication) {
   }
 
   NVF_ERROR(communication->type() == CommunicationType::Allgather);
-  // TODO: fix registration cache
-  // if (allgather_backends_.find(communication) == allgather_backends_.end()) {
-  //   // TODO: retrieve sharded axis here
-  //   auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
-  //   allgather_backends_.try_emplace(
-  //       communication,
-  //       AllgatherThroughCudaMemcpyAsync(
-  //           input_tensor,
-  //           output_tensors,
-  //           communicator_));
-  // }
-  // allgather_backends_.at(communication).post();
-
-  auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
-  AllgatherThroughCudaMemcpyAsync allgather_backend(input_tensor, output_tensors, communicator_);
-  allgather_backend.post();
+
+  std::vector<at::Tensor> output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
+  std::vector<void*> input_ptrs = communicator_->getRemotePtrs(input_tensor);
+  cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(communicator_->local_rank())).stream();
+  // TODO: use multicast
+  for (auto i = 0; i < communicator_->size(); i++) {
+    auto output = output_tensors.at(i);
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output.data_ptr(), input_ptrs.at(i), output.numel() * output.element_size(), cudaMemcpyDeviceToDevice, stream));
+  }
+
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 44e615eb484..8e281b66143 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -48,40 +48,8 @@ class HostIrExecutor : public ExecutorAbstract {
   Communicator* communicator_;
 };
 
-template <typename T>
-std::vector<uint8_t> toBytes(T data) {
-  return std::vector<uint8_t>(
-      reinterpret_cast<uint8_t*>(&data),
-      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
-}
-
-template <typename T>
-T fromBytes(std::vector<uint8_t> bytes) {
-  return *reinterpret_cast<T*>(bytes.data());
-}
-
 namespace hir {
 
-
-class AllgatherThroughCudaMemcpyAsync {
- public:
-  AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector<at::Tensor> outputs, Communicator* communicator);
-
-  void post() const;
-
- private:
-  std::string prefix() const {
-    return "AllgatherThroughCudaMemcpyAsync_uniqueId=" + std::to_string(unique_id);
-  }
-
-  static int64_t running_counter;
-  int64_t unique_id;
-  Communicator* communicator_;
-  std::vector<int64_t> sizes_;
-  std::vector<void*> input_ptrs_;
-  std::vector<void*> output_ptrs_;
-};
-
 /*
 a HostIrEvaluator evaluates a host programs represented through a
 HostIrContainer It is instantiated with the desired HostIrContainer, and runs
@@ -177,7 +145,6 @@ class HostIrEvaluator final : public OptOutDispatch {
   std::unordered_map<StreamKey, c10::cuda::CUDAStream> streams_;
   std::unordered_map<Expr*, c10::intrusive_ptr<c10d::Work>> works_;
   const int64_t my_local_device_index_;
-  std::unordered_map<Expr*, AllgatherThroughCudaMemcpyAsync> allgather_backends_;
 };
 
 } // namespace hir
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index ce102695637..ce8c2226ca1 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -319,4 +319,45 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
+struct IpcTensorInfo {
+  cudaIpcMemHandle_t ipc_handle;
+  int64_t storage_offset;
+  int64_t element_size;
+};
+
+std::vector<void*> Communicator::getRemotePtrs(at::Tensor tensor) {
+  auto it = remote_ptrs_.find(tensor);
+  if (it == remote_ptrs_.end()) {
+    if (deviceId() == 0) {
+      std::cout << "rank " << deviceId() << " registers tensor " << tensor.data_ptr() << "with hash" << std::endl;
+    }
+    std::vector<void*> remote_ptrs(size(), nullptr);
+    std::string prefix = "nvfuser_ipc_tensor_info_";
+    IpcTensorInfo ipc_tensor_info;
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr()));
+    ipc_tensor_info.storage_offset = tensor.storage_offset();
+    ipc_tensor_info.element_size = tensor.element_size();
+
+    const int64_t my_rank = deviceId();
+    auto store = getTcpStore();
+    store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
+
+    barrier();
+
+    for (int64_t rank: c10::irange(size())) {
+      if (rank == my_rank) {
+        remote_ptrs.at(rank) = tensor.data_ptr();
+      } else {
+        ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix + std::to_string(rank)));
+        void*& ptr = remote_ptrs.at(rank);
+        NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+        // TODO: close ipc mem handle at shutdown
+        ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size);
+      }
+    }
+    it = remote_ptrs_.emplace(tensor, std::move(remote_ptrs)).first;
+  }
+  return it->second;
+}
+
 } // namespace nvfuser
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 3ac48d9906b..7124b91f006 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -24,6 +24,18 @@
 
 namespace nvfuser {
 
+template <typename T>
+std::vector<uint8_t> toBytes(T data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<uint8_t*>(&data),
+      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+T fromBytes(std::vector<uint8_t> bytes) {
+  return *reinterpret_cast<T*>(bytes.data());
+}
+
 // This file implements the class Communicator which sets up the inter-process
 // Backend. This class contains inter-process information, such as the rank, the
 // world size, as well as the Process Group that can be called to perform
@@ -142,7 +154,24 @@ class Communicator {
     return store_;
   }
 
+  std::vector<void*> getRemotePtrs(at::Tensor tensor);
+
  private:
+  struct TensorHash {
+    std::size_t operator()(const at::Tensor& tensor) const {
+      auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
+      auto offset = tensor.storage_offset();
+      auto element_size = tensor.element_size();
+      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^ std::hash<int>()(element_size);
+    }
+  };
+
+  struct TensorEqual {
+    bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const {
+      return lhs.equal(rhs);
+    }
+  };
+
   Communicator(
       CommunicatorBackend backend = comm_backend_default,
       RankType server_local_rank = comm_server_local_rank_default);
@@ -175,6 +204,7 @@ class Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
+  std::unordered_map<at::Tensor, std::vector<void*>, TensorHash, TensorEqual> remote_ptrs_;
 };
 
 } // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 026df97ea26..db44f0a5e31 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -136,8 +136,7 @@ TEST_F(GpuCommTest, Allgather) {
     return at::empty({kTensorSize}, tensor_options);
   });
 
-  hir::AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_);
-  allgather.post();
+  // AllgatherThroughCudaMemcpyAsync(input, outputs, communicator_);
 
   torch::cuda::synchronize();
   communicator_->barrier();

From 356feeb188c9aec0db0c3dc1d2dabe5f3ddf9f32 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 3 Feb 2025 07:08:09 -0800
Subject: [PATCH 36/55] synchronize running stream with original stream at the
 beginning of pipeline for-loop

---
 csrc/host_ir/executor.cpp | 19 ++++++++++++++++++-
 csrc/host_ir/lower.cpp    |  2 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 7bc35107996..4133f3f7a75 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -437,11 +437,28 @@ void HostIrEvaluator::handle(Communication* communication) {
 
   std::vector<at::Tensor> output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
   std::vector<void*> input_ptrs = communicator_->getRemotePtrs(input_tensor);
-  cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(communicator_->local_rank())).stream();
+  cudaStream_t current_stream = c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream();
   // TODO: use multicast
   for (auto i = 0; i < communicator_->size(); i++) {
+    cudaStream_t stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false, my_local_device_index_).stream();
+    cudaEvent_t event = {};
+    NVFUSER_CUDA_RT_SAFE_CALL(
+        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream));
+    NVFUSER_CUDA_RT_SAFE_CALL(
+        cudaStreamWaitEvent(stream, event, cudaEventWaitDefault));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
+
     auto output = output_tensors.at(i);
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output.data_ptr(), input_ptrs.at(i), output.numel() * output.element_size(), cudaMemcpyDeviceToDevice, stream));
+
+    // sync
+    NVFUSER_CUDA_RT_SAFE_CALL(
+        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream));
+    NVFUSER_CUDA_RT_SAFE_CALL(
+        cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
   }
 
 }
diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp
index 0bb4a8b885c..f4a53783460 100644
--- a/csrc/host_ir/lower.cpp
+++ b/csrc/host_ir/lower.cpp
@@ -457,6 +457,7 @@ std::vector<Expr*> HostIrLower::lowerToCollectiveBasedPipelinedGemmComm(
   auto* stream_index = mod(j, number_of_streams);
   auto* stream = IrBuilder::create<hir::Stream>(stream_index);
   auto* set_stream = IrBuilder::create<hir::SetCurrentStream>(stream);
+  auto* initial_sync_stream = IrBuilder::create<hir::Synchronize>(original_stream);
 
   TensorView* tva_j = select(tva, 0, j);
   TensorView* tva_allgathered_j = select(tva_allgathered, 0, j);
@@ -496,6 +497,7 @@ std::vector<Expr*> HostIrLower::lowerToCollectiveBasedPipelinedGemmComm(
 
   std::vector<Expr*> loop_body = {
       set_stream,
+      initial_sync_stream,
       tva_j->definition(),
       tva_allgathered_j->definition(),
       communication,

From 4c0736a7f06e9a1b0189bd194f3cbee6aa8e93e7 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Mon, 3 Feb 2025 07:11:01 -0800
Subject: [PATCH 37/55] lint

---
 bench/process_outputs                    |   2 +-
 bench/test                               |   1 -
 csrc/host_ir/executor.cpp                |  19 +-
 csrc/host_ir/lower.cpp                   |   3 +-
 csrc/host_ir/lower.h                     |   4 +-
 csrc/multidevice/communication.h         |   1 -
 csrc/multidevice/communicator.cpp        |  18 +-
 csrc/multidevice/communicator.h          |   6 +-
 csrc/multidevice/executor.cpp            |   4 +-
 csrc/multidevice/executor.h              |   2 +-
 tests/cpp/multidevice_kernels.cu         |   8 +-
 tests/cpp/test_multidevice_gpu_comms.cpp |  35 +--
 tests/cpp/test_multidevice_overlap.cpp   | 258 +++++++++++++----------
 tests/cpp/test_multidevice_pipeline.cpp  |   4 +-
 14 files changed, 217 insertions(+), 148 deletions(-)

diff --git a/bench/process_outputs b/bench/process_outputs
index c1781394dbc..8913a10dd04 100755
--- a/bench/process_outputs
+++ b/bench/process_outputs
@@ -4,4 +4,4 @@ FILE="/opt/pytorch/Fuser/bench/logs/${1}/info"
 
 cat $FILE | grep "rank 0: "  #| awk '{print $4}'
 
-# | grep -E 'Streams32\b'
\ No newline at end of file
+# | grep -E 'Streams32\b'
diff --git a/bench/test b/bench/test
index 19275e4b2e5..8abc200f9a9 100755
--- a/bench/test
+++ b/bench/test
@@ -88,4 +88,3 @@ CMD="${NSYSCMD} ${MPICMD}"
 sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid"
 echo $CMD | tee -a ${LOG_FILE_INFO}
 $CMD | tee -a ${LOG_FILE_INFO}
-
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 4133f3f7a75..cc30ee58316 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -435,12 +435,16 @@ void HostIrEvaluator::handle(Communication* communication) {
 
   NVF_ERROR(communication->type() == CommunicationType::Allgather);
 
-  std::vector<at::Tensor> output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
+  std::vector<at::Tensor> output_tensors =
+      at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
   std::vector<void*> input_ptrs = communicator_->getRemotePtrs(input_tensor);
-  cudaStream_t current_stream = c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream();
+  cudaStream_t current_stream =
+      c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream();
   // TODO: use multicast
   for (auto i = 0; i < communicator_->size(); i++) {
-    cudaStream_t stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false, my_local_device_index_).stream();
+    cudaStream_t stream = c10::cuda::getStreamFromPool(
+                              /*isHighPriority=*/false, my_local_device_index_)
+                              .stream();
     cudaEvent_t event = {};
     NVFUSER_CUDA_RT_SAFE_CALL(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
@@ -450,7 +454,12 @@ void HostIrEvaluator::handle(Communication* communication) {
     NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
 
     auto output = output_tensors.at(i);
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output.data_ptr(), input_ptrs.at(i), output.numel() * output.element_size(), cudaMemcpyDeviceToDevice, stream));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+        output.data_ptr(),
+        input_ptrs.at(i),
+        output.numel() * output.element_size(),
+        cudaMemcpyDeviceToDevice,
+        stream));
 
     // sync
     NVFUSER_CUDA_RT_SAFE_CALL(
@@ -460,7 +469,6 @@ void HostIrEvaluator::handle(Communication* communication) {
         cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault));
     NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
   }
-
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
@@ -474,7 +482,6 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   CommunicatorBackend backend_type = communication->backend();
 
   if (backend_type != CommunicatorBackend::kCuda) {
-
     works_[communication] = postSingleCommunication(
         communication,
         communicator_->deviceId(),
diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp
index f4a53783460..194d7ee7170 100644
--- a/csrc/host_ir/lower.cpp
+++ b/csrc/host_ir/lower.cpp
@@ -457,7 +457,8 @@ std::vector<Expr*> HostIrLower::lowerToCollectiveBasedPipelinedGemmComm(
   auto* stream_index = mod(j, number_of_streams);
   auto* stream = IrBuilder::create<hir::Stream>(stream_index);
   auto* set_stream = IrBuilder::create<hir::SetCurrentStream>(stream);
-  auto* initial_sync_stream = IrBuilder::create<hir::Synchronize>(original_stream);
+  auto* initial_sync_stream =
+      IrBuilder::create<hir::Synchronize>(original_stream);
 
   TensorView* tva_j = select(tva, 0, j);
   TensorView* tva_allgathered_j = select(tva_allgathered, 0, j);
diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h
index 88d5dd10fa7..bce81d3ecab 100644
--- a/csrc/host_ir/lower.h
+++ b/csrc/host_ir/lower.h
@@ -20,8 +20,8 @@ struct HostIrLowerParams {
 
 class HostIrLower {
  public:
-
-  HostIrLower(HostIrLowerParams params = HostIrLowerParams()) : params_(params) {}
+  HostIrLower(HostIrLowerParams params = HostIrLowerParams())
+      : params_(params) {}
 
   // The flag `ignore_inner_resharding` is useful because the preseg passes
   // `InsertReshardingsPass` and `ReorderShardedAxisPass` want different
diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h
index 944df467a62..d8724356e15 100644
--- a/csrc/multidevice/communication.h
+++ b/csrc/multidevice/communication.h
@@ -164,7 +164,6 @@ class P2PCommunication : public Expr {
   CommunicatorBackend& backend() {
     return attribute<CommunicatorBackend>(2);
   }
-
 };
 
 // The method "post" triggers the execution of the communication. This call is
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index ce8c2226ca1..46be9eb885f 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -329,12 +329,14 @@ std::vector<void*> Communicator::getRemotePtrs(at::Tensor tensor) {
   auto it = remote_ptrs_.find(tensor);
   if (it == remote_ptrs_.end()) {
     if (deviceId() == 0) {
-      std::cout << "rank " << deviceId() << " registers tensor " << tensor.data_ptr() << "with hash" << std::endl;
+      std::cout << "rank " << deviceId() << " registers tensor "
+                << tensor.data_ptr() << "with hash" << std::endl;
     }
     std::vector<void*> remote_ptrs(size(), nullptr);
     std::string prefix = "nvfuser_ipc_tensor_info_";
     IpcTensorInfo ipc_tensor_info;
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr()));
+    NVFUSER_CUDA_RT_SAFE_CALL(
+        cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr()));
     ipc_tensor_info.storage_offset = tensor.storage_offset();
     ipc_tensor_info.element_size = tensor.element_size();
 
@@ -344,15 +346,19 @@ std::vector<void*> Communicator::getRemotePtrs(at::Tensor tensor) {
 
     barrier();
 
-    for (int64_t rank: c10::irange(size())) {
+    for (int64_t rank : c10::irange(size())) {
       if (rank == my_rank) {
         remote_ptrs.at(rank) = tensor.data_ptr();
       } else {
-        ipc_tensor_info = fromBytes<IpcTensorInfo>(store->get(prefix + std::to_string(rank)));
+        ipc_tensor_info =
+            fromBytes<IpcTensorInfo>(store->get(prefix + std::to_string(rank)));
         void*& ptr = remote_ptrs.at(rank);
-        NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+        NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
+            &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
         // TODO: close ipc mem handle at shutdown
-        ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size);
+        ptr = (void*)((uint8_t*)ptr +
+                      ipc_tensor_info.storage_offset *
+                          ipc_tensor_info.element_size);
       }
     }
     it = remote_ptrs_.emplace(tensor, std::move(remote_ptrs)).first;
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 7124b91f006..ebe4a60ddfd 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -162,7 +162,8 @@ class Communicator {
       auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
       auto offset = tensor.storage_offset();
       auto element_size = tensor.element_size();
-      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^ std::hash<int>()(element_size);
+      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^
+          std::hash<int>()(element_size);
     }
   };
 
@@ -204,7 +205,8 @@ class Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
-  std::unordered_map<at::Tensor, std::vector<void*>, TensorHash, TensorEqual> remote_ptrs_;
+  std::unordered_map<at::Tensor, std::vector<void*>, TensorHash, TensorEqual>
+      remote_ptrs_;
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp
index eaea12ef2f3..5cb0d691f09 100644
--- a/csrc/multidevice/executor.cpp
+++ b/csrc/multidevice/executor.cpp
@@ -29,8 +29,8 @@ MultiDeviceExecutor::MultiDeviceExecutor(
   std::unique_ptr<hir::HostIrContainer> hic =
       lower.lower(std::move(fusion), comm.deviceId());
   // Create the HostIrEvaluator representing the host program
-  host_ir_executor_ =
-      std::make_unique<hir::HostIrEvaluator>(std::move(hic), &comm, params.executor);
+  host_ir_executor_ = std::make_unique<hir::HostIrEvaluator>(
+      std::move(hic), &comm, params.executor);
 }
 
 std::vector<at::Tensor> MultiDeviceExecutor::runWithInput(
diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h
index e43b7c57f72..a3953fd0a47 100644
--- a/csrc/multidevice/executor.h
+++ b/csrc/multidevice/executor.h
@@ -11,8 +11,8 @@
 #include <exceptions.h>
 #include <fusion.h>
 #include <fusion_segmenter.h>
-#include <host_ir/lower.h>
 #include <host_ir/executor.h>
+#include <host_ir/lower.h>
 #include <ir/cloner.h>
 #include <multidevice/communication.h>
 #include <multidevice/communicator.h>
diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu
index 1d38e034137..cd8275dc92c 100644
--- a/tests/cpp/multidevice_kernels.cu
+++ b/tests/cpp/multidevice_kernels.cu
@@ -10,12 +10,16 @@
 // (except raw headers). Compiling dynamic_type.h with nvcc is not supported.
 // Compiling pytorch with nvcc is not supported either.
 
-#include <tests/cpp/multidevice_kernels.h>
 #include <cuda.h>
+#include <tests/cpp/multidevice_kernels.h>
 
 namespace nvfuser {
 
-#define CUDA_CALL(call) NVF_ERROR((call) == cudaSuccess, "CUDA call failed: ", cudaGetErrorString(cudaGetLastError()))
+#define CUDA_CALL(call)      \
+  NVF_ERROR(                 \
+      (call) == cudaSuccess, \
+      "CUDA call failed: ",  \
+      cudaGetErrorString(cudaGetLastError()))
 
 __global__ void DummyMultiDeviceKernel() {}
 
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index db44f0a5e31..37e72445484 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -37,10 +37,12 @@ TEST_F(GpuCommTest, IpcMemHandle) {
   auto store = communicator_->getTcpStore();
   store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle));
   communicator_->barrier();
-  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get("ipc_handle_" + std::to_string((rank + 1) % num_devices)));
+  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(
+      store->get("ipc_handle_" + std::to_string((rank + 1) % num_devices)));
 
   void* peer_d_ptr;
-  CUDA_CALL(cudaIpcOpenMemHandle(&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  CUDA_CALL(cudaIpcOpenMemHandle(
+      &peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
 
   int64_t peer_value;
   CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size, cudaMemcpyDeviceToHost));
@@ -50,7 +52,6 @@ TEST_F(GpuCommTest, IpcMemHandle) {
   // Clean up
   CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
   CUDA_CALL(cudaFree(d_ptr));
-
 }
 
 TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) {
@@ -75,13 +76,16 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) {
   auto store = communicator_->getTcpStore();
   store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle));
   communicator_->barrier();
-  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get("ipc_handle_" + std::to_string(peer_rank)));
+  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(
+      store->get("ipc_handle_" + std::to_string(peer_rank)));
 
   int64_t* peer_d_ptr;
-  CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  CUDA_CALL(cudaIpcOpenMemHandle(
+      (void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
 
   int64_t peer_value;
-  CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr + 1, size / 2, cudaMemcpyDeviceToHost));
+  CUDA_CALL(cudaMemcpy(
+      &peer_value, peer_d_ptr + 1, size / 2, cudaMemcpyDeviceToHost));
 
   EXPECT_EQ(2 * peer_rank + 1, peer_value);
 
@@ -91,7 +95,8 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) {
 }
 
 TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
-  // TLDR; We CANNOT do pointer arithmetic on the sender side! The IPC handle points to the beginning of the allocated buffer.
+  // TLDR; We CANNOT do pointer arithmetic on the sender side! The IPC handle
+  // points to the beginning of the allocated buffer.
 
   // Allocate GPU memory
   constexpr size_t size = 2 * sizeof(int64_t);
@@ -112,15 +117,20 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
   auto store = communicator_->getTcpStore();
   store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle));
   communicator_->barrier();
-  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(store->get("ipc_handle_" + std::to_string(peer_rank)));
+  auto peer_ipc_handle = fromBytes<cudaIpcMemHandle_t>(
+      store->get("ipc_handle_" + std::to_string(peer_rank)));
 
   int64_t* peer_d_ptr;
-  CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  CUDA_CALL(cudaIpcOpenMemHandle(
+      (void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess));
 
   int64_t peer_value;
-  CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size / 2, cudaMemcpyDeviceToHost));
+  CUDA_CALL(
+      cudaMemcpy(&peer_value, peer_d_ptr, size / 2, cudaMemcpyDeviceToHost));
 
-  EXPECT_EQ(2 * peer_rank, peer_value); // and not 2 * peer_rank + 1 as could be expected!
+  EXPECT_EQ(
+      2 * peer_rank,
+      peer_value); // and not 2 * peer_rank + 1 as could be expected!
 
   // Clean up
   CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr));
@@ -130,7 +140,8 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
 TEST_F(GpuCommTest, Allgather) {
   constexpr int64_t kTensorSize = 1024;
 
-  at::Tensor input = at::full({kTensorSize}, communicator_->deviceId(), tensor_options);
+  at::Tensor input =
+      at::full({kTensorSize}, communicator_->deviceId(), tensor_options);
   auto outputs = std::vector<at::Tensor>(communicator_->size());
   std::generate(outputs.begin(), outputs.end(), [&]() {
     return at::empty({kTensorSize}, tensor_options);
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 6344fcae890..ee916344001 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -9,6 +9,9 @@
 #include <ATen/cuda/CUDAGraph.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/ArrayRef.h>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
 #include <fusion.h>
 #include <host_ir/container.h>
 #include <host_ir/executor.h>
@@ -16,9 +19,6 @@
 #include <ir/utils.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
-#include <cuda.h>
-#include <cuda_profiler_api.h>
-#include <cuda_runtime.h>
 
 #define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 0
 
@@ -55,7 +55,9 @@ using DummyOverlapBenchmarkParams = std::tuple<
     /*pre_comm=*/bool,
     /*post_comm=*/bool>;
 
-class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
+class DummyOverlapBenchmark
+    : public MultiDeviceTest,
+      public testing::WithParamInterface<DummyOverlapBenchmarkParams> {
  protected:
   static std::map<std::string, float> times;
 
@@ -64,8 +66,9 @@ class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamI
     if (rank != 0) {
       return;
     }
-    for (auto it: times) {
-      std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl;
+    for (auto it : times) {
+      std::cout << "time " << rank << ": " << it.first << ": " << it.second
+                << std::endl;
     }
   }
 };
@@ -75,38 +78,38 @@ std::map<std::string, float> DummyOverlapBenchmark::times = {};
 TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   constexpr int64_t number_of_warmups = 20;
   constexpr int64_t number_of_iterations = 80;
-  constexpr int64_t total_number_of_iterations = number_of_warmups + number_of_iterations;
+  constexpr int64_t total_number_of_iterations =
+      number_of_warmups + number_of_iterations;
   constexpr int64_t iteration_profiler_start = 5;
   constexpr int64_t iteration_profiler_end = 10;
 
-
-  auto [backend,
-        M,
-        K,
-        N,
-        L,
-        pre_comm,
-        post_comm] = GetParam();
+  auto [backend, M, K, N, L, pre_comm, post_comm] = GetParam();
 
   std::vector<RankType> all_ranks(communicator_->size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
   auto world = communicator_->getBackendForTeam(all_ranks, backend);
-  auto nccl_world = communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
+  auto nccl_world =
+      communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
 
   std::vector<c10::cuda::CUDAStream> streams =
       createStreams(2, communicator_->deviceId());
   auto& compute_stream = streams.at(0);
   auto& communication_stream = streams.at(1);
 
-  auto options_matmul = at::TensorOptions().dtype(torch::kFloat16).device(communicator_->device());
+  auto options_matmul = at::TensorOptions()
+                            .dtype(torch::kFloat16)
+                            .device(communicator_->device());
   auto ta = at::randn({M, K}, options_matmul);
   auto tb = at::randn({K, N}, options_matmul);
   auto tc = at::empty({M, N}, options_matmul);
 
-  auto options_comms = at::TensorOptions().dtype(torch::kFloat32).device(communicator_->device());
+  auto options_comms = at::TensorOptions()
+                           .dtype(torch::kFloat32)
+                           .device(communicator_->device());
   auto src = at::randn({L}, options_comms);
-  auto dst = at::empty({L * communicator_->size()},  options_comms);
-  std::vector<at::Tensor> barrier_scratch_buffer = {at::randn({1}, options_comms)};
+  auto dst = at::empty({L * communicator_->size()}, options_comms);
+  std::vector<at::Tensor> barrier_scratch_buffer = {
+      at::randn({1}, options_comms)};
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
@@ -114,10 +117,10 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
 
   nccl_world->allreduce(barrier_scratch_buffer)->wait();
 
-  for (const auto& iteration :
-       c10::irange(total_number_of_iterations)) {
+  for (const auto& iteration : c10::irange(total_number_of_iterations)) {
     if (iteration % 10 == 0 && communicator_->deviceId() == 0) {
-      std::cout << "iteration " << iteration <<"/" << total_number_of_iterations << std::endl;
+      std::cout << "iteration " << iteration << "/"
+                << total_number_of_iterations << std::endl;
     }
     if (iteration == iteration_profiler_start) {
       cudaProfilerStart();
@@ -141,7 +144,8 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
     }
 
     if (iteration == iteration_profiler_end) {
-      cudaProfilerStop();;
+      cudaProfilerStop();
+      ;
     }
     if (!pre_comm & !post_comm) {
       nccl_world->allreduce(barrier_scratch_buffer)->wait();
@@ -157,32 +161,43 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) {
   cudaEventElapsedTime(&milliseconds, start, stop);
   milliseconds /= number_of_iterations;
 
-  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  std::string test_name =
+      ::testing::UnitTest::GetInstance()->current_test_info()->name();
   times.insert({test_name, milliseconds});
-  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name
+            << " : " << milliseconds << std::endl;
 }
 
 INSTANTIATE_TEST_SUITE_P(
     ,
     DummyOverlapBenchmark,
     testing::Combine(
-    testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
-    /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)),
-    /*L=*/testing::Values(1, pow(2,10), pow(2,15), pow(2,17), pow(2,20), pow(2,24), pow(2,26), pow(2,28)),
-    /*pre-comm=*/testing::Bool(),
-    /*post-comm=*/testing::Bool()),
+        testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
+        /*M=*/testing::Values(pow(2, 10), pow(2, 15), pow(2, 17)),
+        /*K=*/testing::Values(pow(2, 10), pow(2, 15), pow(2, 17)),
+        /*N=*/testing::Values(pow(2, 10), pow(2, 15), pow(2, 17)),
+        /*L=*/
+        testing::Values(
+            1,
+            pow(2, 10),
+            pow(2, 15),
+            pow(2, 17),
+            pow(2, 20),
+            pow(2, 24),
+            pow(2, 26),
+            pow(2, 28)),
+        /*pre-comm=*/testing::Bool(),
+        /*post-comm=*/testing::Bool()),
     [](const testing::TestParamInfo<DummyOverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
-      os << /*backend*/std::get<0>(info.param) << "_"
+      os << /*backend*/ std::get<0>(info.param) << "_"
          << "M" << std::get<1>(info.param) << "_"
          << "K" << std::get<2>(info.param) << "_"
          << "N" << std::get<3>(info.param) << "_"
          << "L" << std::get<4>(info.param)
-         << ((std::get<5>(info.param))? "_pre_comm" : "")
-         << ((std::get<6>(info.param))? "_post_comm" : "");
+         << ((std::get<5>(info.param)) ? "_pre_comm" : "")
+         << ((std::get<6>(info.param)) ? "_post_comm" : "");
       return os.str();
     });
 
@@ -199,7 +214,9 @@ using OverlapBenchmarkParams = std::tuple<
     /*use_cuda_graph=*/bool,
     DataType>;
 
-class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface<OverlapBenchmarkParams> {
+class OverlapBenchmark
+    : public MultiDeviceTest,
+      public testing::WithParamInterface<OverlapBenchmarkParams> {
  protected:
   static std::map<std::string, float> times;
 
@@ -208,8 +225,9 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf
     if (rank != 0) {
       return;
     }
-    for (auto it: times) {
-      std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl;
+    for (auto it : times) {
+      std::cout << "time " << rank << ": " << it.first << ": " << it.second
+                << std::endl;
     }
   }
 };
@@ -223,19 +241,19 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   constexpr int64_t iteration_profiler_end = 15;
   constexpr int64_t iteration_cuda_graph_capture = 5;
 
-
   const int64_t D = communicator_->size();
-  auto [backend,
-        S,
-        M,
-        K,
-        N,
-        number_of_streams,
-        add_cuStreamWriteValue32,
-        number_of_pgs,
-        unfuse_loops,
-        use_cuda_graph,
-        dtype] = GetParam();
+  auto
+      [backend,
+       S,
+       M,
+       K,
+       N,
+       number_of_streams,
+       add_cuStreamWriteValue32,
+       number_of_pgs,
+       unfuse_loops,
+       use_cuda_graph,
+       dtype] = GetParam();
   if (backend == CommunicatorBackend::kCuda) {
     GTEST_SKIP() << "Cuda Backend not supported in this test";
   }
@@ -248,11 +266,13 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
       createStreams(number_of_streams, communicator_->deviceId());
   setCurrentCUDAStream(streams.at(0));
 
-  auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device());
-  auto ta = at::randn({S, M/S,K}, options);
-  auto ta_unsharded = at::empty({S, D, M/S,K}, options);
-  auto tb = at::randn({K,N}, options);
-  auto tc = at::empty({S, D, M/S, N}, options);
+  auto options = at::TensorOptions()
+                     .dtype(data_type_to_aten(dtype))
+                     .device(communicator_->device());
+  auto ta = at::randn({S, M / S, K}, options);
+  auto ta_unsharded = at::empty({S, D, M / S, K}, options);
+  auto tb = at::randn({K, N}, options);
+  auto tc = at::empty({S, D, M / S, N}, options);
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
@@ -270,7 +290,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   for (const auto& iteration :
        c10::irange(number_of_warmups + number_of_iterations)) {
     if (iteration == iteration_profiler_start) {
-      cudaProfilerStart();;
+      cudaProfilerStart();
+      ;
     }
     if (iteration == number_of_warmups) {
       cudaEventRecord(start);
@@ -283,7 +304,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
         int64_t stream_index = j % streams.size();
         setCurrentCUDAStream(streams.at(stream_index));
 
-        auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs));
+        auto world = communicator_->getBackendForTeam(
+            all_ranks, backend, std::to_string(j % number_of_pgs));
 
         auto ta_j = ta.select(0, j);
         auto ta_unsharded_j = ta_unsharded.select(0, j);
@@ -295,15 +317,17 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
         if (add_cuStreamWriteValue32) {
           cuStreamWriteValue32(
 #if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS
-            (CUstream)world->getCudaStream(communicator_->device()).stream(),
+              (CUstream)world->getCudaStream(communicator_->device()).stream(),
 #else
-            (CUstream)streams.at(stream_index).stream(),
+              (CUstream)streams.at(stream_index).stream(),
 #endif
-            (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0);
+              (CUdeviceptr)pDevice,
+              (cuuint32_t)(iteration * S + j),
+              (unsigned int)0);
         }
         if (unfuse_loops == false) {
           // compute
-          torch::matmul_out(tc_j, ta_unsharded_j,tb);
+          torch::matmul_out(tc_j, ta_unsharded_j, tb);
         }
       }
       if (unfuse_loops) {
@@ -314,7 +338,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
           auto tc_j = tc.select(0, j);
 
           // compute
-          torch::matmul_out(tc_j, ta_unsharded_j,tb);
+          torch::matmul_out(tc_j, ta_unsharded_j, tb);
         }
       }
       if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) {
@@ -327,7 +351,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
       cuda_graph.replay();
     }
     if (iteration == iteration_profiler_end) {
-      cudaProfilerStop();;
+      cudaProfilerStop();
+      ;
     }
   }
   cudaEventRecord(stop);
@@ -336,14 +361,21 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) {
   cudaEventElapsedTime(&milliseconds, start, stop);
   milliseconds /= number_of_iterations;
 
-  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  std::string test_name =
+      ::testing::UnitTest::GetInstance()->current_test_info()->name();
   times.insert({test_name, milliseconds});
-  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name
+            << " : " << milliseconds << std::endl;
 
   if (add_cuStreamWriteValue32) {
-    std::cout << "RANK " << communicator_->device() << " entering while loop. Max index=" << (number_of_warmups + number_of_iterations)*S + S << std::endl;
-    while (*ptr < (cuuint32_t)(number_of_warmups + number_of_iterations)*S + S - 1) {
-      std::cout << "RANK " << communicator_->device() << " waiting at index=" << *ptr << std::endl;
+    std::cout << "RANK " << communicator_->device()
+              << " entering while loop. Max index="
+              << (number_of_warmups + number_of_iterations) * S + S
+              << std::endl;
+    while (*ptr <
+           (cuuint32_t)(number_of_warmups + number_of_iterations) * S + S - 1) {
+      std::cout << "RANK " << communicator_->device()
+                << " waiting at index=" << *ptr << std::endl;
     }
     cudaFree((void*)ptr);
   }
@@ -356,27 +388,30 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   constexpr int64_t iteration_profiler_end = 15;
 
   const int64_t D = communicator_->size();
-  auto [backend,
-        S,
-        M,
-        K,
-        N,
-        number_of_streams,
-        add_cuStreamWriteValue32,
-        number_of_pgs,
-        unfuse_loops,
-        use_cuda_graph,
-        dtype] = GetParam();
+  auto
+      [backend,
+       S,
+       M,
+       K,
+       N,
+       number_of_streams,
+       add_cuStreamWriteValue32,
+       number_of_pgs,
+       unfuse_loops,
+       use_cuda_graph,
+       dtype] = GetParam();
 
   if (M % (D * S) != 0) {
     GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M
                  << ", D = " << D << ", S = " << S;
   }
   if (add_cuStreamWriteValue32) {
-    GTEST_SKIP() << "cuStreamWriteValue32 not supported with StreamParallelType";
+    GTEST_SKIP()
+        << "cuStreamWriteValue32 not supported with StreamParallelType";
   }
   if (number_of_pgs > 1) {
-    GTEST_SKIP() << "StreamParallelType not supported with multiple process groups";
+    GTEST_SKIP()
+        << "StreamParallelType not supported with multiple process groups";
   }
   if (unfuse_loops) {
     GTEST_SKIP() << "StreamParallelType not supported with unfused loops";
@@ -385,7 +420,6 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
     GTEST_SKIP() << "StreamParallelType not supported with cuda graphs";
   }
 
-
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -410,9 +444,9 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   params.executor.number_of_streams = number_of_streams;
   MultiDeviceExecutor executor(std::move(fusion), *communicator_, params);
 
-
-  auto tensor_options =
-      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device());
+  auto tensor_options = at::TensorOptions()
+                            .dtype(data_type_to_aten(dtype))
+                            .device(communicator_->device());
   at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options);
   at::Tensor ta = ta_unsharded.slice(
       1, communicator_->deviceId(), communicator_->deviceId() + 1);
@@ -426,8 +460,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  for (const auto& iteration :
-       c10::irange(number_of_iterations)) {
+  for (const auto& iteration : c10::irange(number_of_iterations)) {
     if (iteration == iteration_profiler_start) {
       cudaProfilerStart();
     }
@@ -447,46 +480,55 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) {
   cudaEventElapsedTime(&milliseconds, start, stop);
   milliseconds /= number_of_iterations;
 
-  std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  std::string test_name =
+      ::testing::UnitTest::GetInstance()->current_test_info()->name();
   times.insert({test_name, milliseconds});
-  std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl;
+  std::cout << "rank " << communicator_->deviceId() << ", " << test_name
+            << " : " << milliseconds << std::endl;
 
-  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref << ",\ntc: " << tc;
+  EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1))
+      << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref
+      << ",\ntc: " << tc;
 }
 
 INSTANTIATE_TEST_SUITE_P(
     ,
     OverlapBenchmark,
     testing::Combine(
-    testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc, CommunicatorBackend::kCuda),
-    /*S=*/testing::Values(1,2,4,8, 16, 32),
-    /*M=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)),
-    /*K=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)),
-    /*N=*/testing::Values(pow(2,3), pow(2,10), pow(2,15)),
-    /*number_of_streams=*/testing::Values(3, 8, 32),
-    /*add_cuStreamWriteValue32*/testing::Values(false, true),
-    /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
-    /*unfuse_loops=*/testing::Values(false, true),
-    /*use_cuda_graph=*/testing::Values(false), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws
-    testing::Values(DataType::Float, DataType::Half, DataType::BFloat16)),
+        testing::Values(
+            CommunicatorBackend::kNccl,
+            CommunicatorBackend::kUcc,
+            CommunicatorBackend::kCuda),
+        /*S=*/testing::Values(1, 2, 4, 8, 16, 32),
+        /*M=*/testing::Values(pow(2, 3), pow(2, 10), pow(2, 15), pow(2, 18)),
+        /*K=*/testing::Values(pow(2, 3), pow(2, 10), pow(2, 15), pow(2, 18)),
+        /*N=*/testing::Values(pow(2, 3), pow(2, 10), pow(2, 15)),
+        /*number_of_streams=*/testing::Values(3, 8, 32),
+        /*add_cuStreamWriteValue32*/ testing::Values(false, true),
+        /*number_of_pgs=*/testing::Values(1, 2, 4, 8),
+        /*unfuse_loops=*/testing::Values(false, true),
+        /*use_cuda_graph=*/testing::Values(false), // cuda graphs not supported:
+                                                   // ucc does not supports it
+                                                   // (segfault) and nccl PG has
+                                                   // a "syncStream" that throws
+        testing::Values(DataType::Float, DataType::Half, DataType::BFloat16)),
     [](const testing::TestParamInfo<OverlapBenchmarkParams>& info)
         -> std::string {
       std::ostringstream os;
-      os << /*backend*/std::get<0>(info.param) << "_"
+      os << /*backend*/ std::get<0>(info.param) << "_"
          << "S" << std::get<1>(info.param) << "_"
          << "M" << std::get<2>(info.param) << "_"
          << "K" << std::get<3>(info.param) << "_"
          << "N" << std::get<4>(info.param) << "_"
          << "Streams" << std::get<5>(info.param) << "_"
-         << /*dtype:*/std::get<10>(info.param) << "_"
-         << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "")
+         << /*dtype:*/ std::get<10>(info.param) << "_"
+         << ((std::get<6>(info.param)) ? "WithcuStreamWriteValue32_" : "")
          << "Pgs" << std::get<7>(info.param)
-         << ((std::get<8>(info.param))? "_unfused" : "")
-         << ((std::get<9>(info.param))? "_WithCudaGraph" : "");
+         << ((std::get<8>(info.param)) ? "_unfused" : "")
+         << ((std::get<9>(info.param)) ? "_WithCudaGraph" : "");
       return os.str();
     });
 
-
 struct OverlapTestParams {
   // Tensors sizes
   int64_t M = std::pow(2, 6);
diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp
index ca113123e16..bee3bb7c56c 100644
--- a/tests/cpp/test_multidevice_pipeline.cpp
+++ b/tests/cpp/test_multidevice_pipeline.cpp
@@ -127,9 +127,7 @@ void PipelineTest::executeAndValidate(bool validate_with_prescribed_values) {
   MultiDeviceExecutorParams params;
   params.executor = host_ir_executor_params;
   runtime = std::make_unique<MultiDeviceExecutor>(
-      std::make_unique<Fusion>(*fusion),
-      *communicator_,
-      params);
+      std::make_unique<Fusion>(*fusion), *communicator_, params);
   auto error_msg = runtime->validate();
   if (error_msg != "") {
     GTEST_SKIP() << error_msg;

From 7fca0355ab99c452172c3b2e63f974a88391cd83 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 5 Feb 2025 14:26:49 -0800
Subject: [PATCH 38/55] wip. The send and recv Expr* need to be matched
 together for associating the buffer. Need to either use
 (non-P2P)Communication* or better have a Host Node to explicitly share the
 handles as something explicit in the Host Ir program

---
 CMakeLists.txt                                |   1 +
 csrc/driver_api.h                             |   1 -
 csrc/host_ir/executor.cpp                     | 181 ++++++++++++++----
 csrc/multidevice/communicator.cpp             |  86 +++++----
 csrc/multidevice/communicator.h               |  46 ++++-
 tests/cpp/multidevice_kernels.cu              |   3 +-
 tests/cpp/test_multidevice_communications.cpp |  55 ++++++
 tests/cpp/test_multidevice_gpu_comms.cpp      |  18 ++
 tests/cpp/test_multidevice_overlap.cpp        |   3 +-
 9 files changed, 311 insertions(+), 83 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 364e39596b9..5e59cfddc65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -346,6 +346,7 @@ target_link_libraries(codegen_internal PUBLIC
   ${LIBCUPTI}
   ${TORCH_LIBRARIES}
   dl
+  cuda
 )
 
 add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)
diff --git a/csrc/driver_api.h b/csrc/driver_api.h
index 8105cf855c2..b8c413a4054 100644
--- a/csrc/driver_api.h
+++ b/csrc/driver_api.h
@@ -32,7 +32,6 @@ namespace nvfuser {
   fn(cuModuleGetFunction);                \
   fn(cuModuleLoadDataEx);                 \
   fn(cuModuleUnload);                     \
-  fn(cuStreamWriteValue32);               \
   fn(cuOccupancyMaxActiveBlocksPerMultiprocessor)
 
 #if (CUDA_VERSION >= 12000)
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index cc30ee58316..070cd299ee3 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -21,6 +21,7 @@
 #include <runtime/executor_dispatch.h>
 #include <runtime/executor_kernel_arg.h>
 #include <runtime/fusion_kernel_runtime.h>
+#include <cuda.h>
 
 namespace nvfuser {
 
@@ -435,40 +436,40 @@ void HostIrEvaluator::handle(Communication* communication) {
 
   NVF_ERROR(communication->type() == CommunicationType::Allgather);
 
-  std::vector<at::Tensor> output_tensors =
-      at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
-  std::vector<void*> input_ptrs = communicator_->getRemotePtrs(input_tensor);
-  cudaStream_t current_stream =
-      c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream();
-  // TODO: use multicast
-  for (auto i = 0; i < communicator_->size(); i++) {
-    cudaStream_t stream = c10::cuda::getStreamFromPool(
-                              /*isHighPriority=*/false, my_local_device_index_)
-                              .stream();
-    cudaEvent_t event = {};
-    NVFUSER_CUDA_RT_SAFE_CALL(
-        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream));
-    NVFUSER_CUDA_RT_SAFE_CALL(
-        cudaStreamWaitEvent(stream, event, cudaEventWaitDefault));
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
-
-    auto output = output_tensors.at(i);
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-        output.data_ptr(),
-        input_ptrs.at(i),
-        output.numel() * output.element_size(),
-        cudaMemcpyDeviceToDevice,
-        stream));
-
-    // sync
-    NVFUSER_CUDA_RT_SAFE_CALL(
-        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream));
-    NVFUSER_CUDA_RT_SAFE_CALL(
-        cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault));
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
-  }
+  // std::vector<at::Tensor> output_tensors =
+  //     at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
+  // const std::vector<void*>& input_ptrs = communicator_->getRemotePtrs(input_tensor);
+  // cudaStream_t current_stream =
+  //     c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream();
+  // // TODO: use multicast
+  // for (auto i = 0; i < communicator_->size(); i++) {
+  //   cudaStream_t stream = c10::cuda::getStreamFromPool(
+  //                             /*isHighPriority=*/false, my_local_device_index_)
+  //                             .stream();
+  //   cudaEvent_t event = {};
+  //   NVFUSER_CUDA_RT_SAFE_CALL(
+  //       cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(
+  //       cudaStreamWaitEvent(stream, event, cudaEventWaitDefault));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
+
+  //   auto output = output_tensors.at(i);
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+  //       output.data_ptr(),
+  //       input_ptrs.at(i),
+  //       output.numel() * output.element_size(),
+  //       cudaMemcpyDeviceToDevice,
+  //       stream));
+
+  //   // sync
+  //   NVFUSER_CUDA_RT_SAFE_CALL(
+  //       cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(
+  //       cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
+  // }
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
@@ -490,20 +491,118 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
         buffer);
     return;
   }
-  NVF_ERROR(false, "CUDA backend not supported yet");
+
+
+
+  // FIST TIME:
+  // sender exports cudaIpc mem handle on input buffer and put it to store
+  // sender signals recv it can open the mem handle. It needs to be CPU blocking
+  // recv opens the handle and gets the pointer.
+  // It copies the data and then signal sender on completion
+
+  // SECOND TIME:
+  // Sender signals recv it can copy
+  // Recv copies the data and signals sender on completion
+
+// each rank must have a bool "recvied" and a "sent" bool per rank. So n+1
+// each rank must have, per rank, a sent_to and a received_from a bool "recvied" and a "sent" bool per rank. So n+1
+
+
+
+  // std::string prefix = "nvfuser_ipc_tensor_info_" + communication->buffer()->name() + "_";
+  // IpcTensorInfo ipc_tensor_info;
+  // NVFUSER_CUDA_RT_SAFE_CALL(
+  //     cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, buffer.data_ptr()));
+  // ipc_tensor_info.storage_offset = buffer.storage_offset();
+  // ipc_tensor_info.element_size = buffer.element_size();
+
+  // auto store = communicator_->getTcpStore();
+  // store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
+
+  // Team team = {my_rank, peer};
+  // communicator_->getBackendForTeam(team, CommunicatorBackend::kNccl)->barrier()->wait();
+
+  // for (int64_t rank : c10::irange(size())) {
+  //   if (rank == my_rank) {
+  //     remote_ptrs.at(rank) = tensor.data_ptr();
+  //   } else {
+  //     ipc_tensor_info =
+  //         fromBytes<IpcTensorInfo>(store->get(prefix + std::to_string(rank)));
+  //     void*& ptr = remote_ptrs.at(rank);
+  //     NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
+  //         &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  //     // TODO: close ipc mem handle at shutdown
+  //     ptr = (void*)((uint8_t*)ptr +
+  //                   ipc_tensor_info.storage_offset *
+  //                       ipc_tensor_info.element_size);
+  //   }
+  // }
+
+  const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
+  const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(buffer, std::to_string(communication->buffer()->name()));
+  const int64_t my_rank = communicator_->deviceId();
+  const int64_t peer = expr_evaluator_.evaluate(communication->peer()).as<int64_t>();
+  const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
+  const RemoteBufferInfo& peer_buffer = remote_buffers.at(peer);
+  const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
+  const auto remote_semaphore = reinterpret_cast<CUdeviceptr>(&peer_buffer.semaphores()[my_rank]);
+  static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
+
+
+  if (communication->type() == P2PCommunicationType::RECV) {
+    std::cout << "RANK " << my_rank << " RECV, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
+    // signal to self that transfer is in progress
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT));
+    // signal sender that receiver is ready
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error
+  } else /*sender*/ {
+    std::cout << "RANK " << my_rank << " SEND, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
+    // wait for sender to be ready
+    // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
+    std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << std::endl;
+    // RDMA writes data from sender to receiver
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+        remote_buffers.at(my_rank).ptr(),
+        my_buffer.ptr(),
+        buffer.numel() * buffer.element_size(),
+        cudaMemcpyDeviceToDevice,
+        current_stream));
+    std::cout << "RANK " << my_rank << " SEND after memcpy" << std::endl;
+    // Signals completion to self
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
+    // Signals completion to receiver
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
+  }
 }
 
 void HostIrEvaluator::handle(Wait* wait) {
   Expr* communication = wait->communication();
-  auto it = works_.find(communication);
-  if (it == works_.end()) {
+  auto* p2p_comm = dynamic_cast<P2PCommunication*>(communication);
+  if (p2p_comm && p2p_comm->backend() != CommunicatorBackend::kCuda) {
+    auto it = works_.find(communication);
+    if (it == works_.end()) {
+      return;
+    }
+    auto& work = it->second;
+    if (work != nullptr) {
+      work->wait();
+    }
+    works_.erase(communication);
     return;
   }
-  auto& work = it->second;
-  if (work != nullptr) {
-    work->wait();
+
+  if (p2p_comm->type() == P2PCommunicationType::RECV) {
+    // const auto current_stream = static_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
+    const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_), std::to_string(p2p_comm->buffer()->name()));
+    const int64_t my_rank = communicator_->deviceId();
+    const int64_t peer = expr_evaluator_.evaluate(p2p_comm->peer()).as<int64_t>();
+    const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
+    const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
+
+    std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
+    // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ));
+    std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
   }
-  works_.erase(communication);
 }
 
 namespace {
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 46be9eb885f..902ceaaa64d 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -319,49 +319,63 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
-struct IpcTensorInfo {
-  cudaIpcMemHandle_t ipc_handle;
-  int64_t storage_offset;
-  int64_t element_size;
-};
-
-std::vector<void*> Communicator::getRemotePtrs(at::Tensor tensor) {
-  auto it = remote_ptrs_.find(tensor);
-  if (it == remote_ptrs_.end()) {
-    if (deviceId() == 0) {
-      std::cout << "rank " << deviceId() << " registers tensor "
-                << tensor.data_ptr() << "with hash" << std::endl;
-    }
-    std::vector<void*> remote_ptrs(size(), nullptr);
-    std::string prefix = "nvfuser_ipc_tensor_info_";
-    IpcTensorInfo ipc_tensor_info;
-    NVFUSER_CUDA_RT_SAFE_CALL(
-        cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr()));
-    ipc_tensor_info.storage_offset = tensor.storage_offset();
-    ipc_tensor_info.element_size = tensor.element_size();
-
-    const int64_t my_rank = deviceId();
+RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor, int64_t size) : ptr_(tensor.data_ptr()), size_(size), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) {
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, size_ * sizeof(IpcSemaphore)));
+  static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int");
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, size_ * sizeof(IpcSemaphore)));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
+}
+
+RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data) : is_imported_(true) {
+  RemoteBufferInfo imported_buffer = fromBytes<RemoteBufferInfo>(data);
+
+  size_ = imported_buffer.size_;
+  storage_offset_ = imported_buffer.storage_offset_;
+  element_size_ = imported_buffer.element_size_;
+  ipc_handle_ = imported_buffer.ipc_handle_;
+  semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
+
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+  ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
+
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+}
+
+RemoteBufferInfo::~RemoteBufferInfo() {
+  // if (is_imported_) {
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&ipc_handle_));
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&semaphores_ipc_handle_));
+  // } else {
+  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(semaphores_));
+  // }
+}
+
+
+std::vector<RemoteBufferInfo> Communicator::getRemoteBuffer(at::Tensor tensor, std::string key) {
+  auto it = remote_buffers_.find(tensor);
+  if (it == remote_buffers_.end()) {
+    RemoteBufferInfo buffer_handle(tensor, size_);
+
     auto store = getTcpStore();
-    store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
+    std::string prefix = "nvfuser_remote_buffer_info_" + key;
+    std::cout << "RANK " << deviceId() << "registers at key " << prefix + std::to_string(deviceId()) << std::endl;
+    store->set(prefix + std::to_string(deviceId()), toBytes(buffer_handle));
 
     barrier();
 
-    for (int64_t rank : c10::irange(size())) {
-      if (rank == my_rank) {
-        remote_ptrs.at(rank) = tensor.data_ptr();
+    std::cout << "RANK " << deviceId() << "after barrier for key " << prefix + std::to_string(deviceId()) << std::endl;
+    std::vector<RemoteBufferInfo> remote_buffers;
+    remote_buffers.reserve(size_);
+    for (int64_t rank : c10::irange(size_)) {
+      if (rank == deviceId()) {
+        remote_buffers.push_back(std::move(buffer_handle));
       } else {
-        ipc_tensor_info =
-            fromBytes<IpcTensorInfo>(store->get(prefix + std::to_string(rank)));
-        void*& ptr = remote_ptrs.at(rank);
-        NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
-            &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
-        // TODO: close ipc mem handle at shutdown
-        ptr = (void*)((uint8_t*)ptr +
-                      ipc_tensor_info.storage_offset *
-                          ipc_tensor_info.element_size);
+        RemoteBufferInfo imported_remote_buffer_info(store->get(prefix + std::to_string(rank)));
+        remote_buffers.push_back(std::move(imported_remote_buffer_info));
       }
     }
-    it = remote_ptrs_.emplace(tensor, std::move(remote_ptrs)).first;
+    it = remote_buffers_.emplace(tensor, std::move(remote_buffers)).first;
   }
   return it->second;
 }
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index ebe4a60ddfd..39c9d667bf4 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -10,6 +10,11 @@
 #include <ATen/core/TensorBody.h>
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
+// #include <cuda.h>
+#include <driver_api.h>
+#include <cuda_utils.h>
+#include <cuda_runtime.h>
+
 
 #include <exceptions.h>
 #include <multidevice/multidevice.h>
@@ -36,6 +41,41 @@ T fromBytes(std::vector<uint8_t> bytes) {
   return *reinterpret_cast<T*>(bytes.data());
 }
 
+enum class IpcSemaphore : cuuint32_t {
+  kReady,
+  kTransferInProgress
+};
+
+class RemoteBufferInfo {
+ public:
+
+  RemoteBufferInfo(at::Tensor tensor, int64_t size);
+  RemoteBufferInfo(std::vector<uint8_t> data); // means it is imported
+  ~RemoteBufferInfo();
+
+  void* ptr() const {
+    return ptr_;
+  }
+
+  auto semaphores() const {
+    return semaphores_;
+  }
+
+  auto size() const {
+    return size_;
+  }
+
+ private:
+  void* ptr_;
+  int64_t size_;
+  int64_t storage_offset_;
+  int64_t element_size_;
+  bool is_imported_;
+  cudaIpcMemHandle_t ipc_handle_;
+  cudaIpcMemHandle_t semaphores_ipc_handle_;
+  IpcSemaphore* semaphores_;
+};
+
 // This file implements the class Communicator which sets up the inter-process
 // Backend. This class contains inter-process information, such as the rank, the
 // world size, as well as the Process Group that can be called to perform
@@ -154,7 +194,7 @@ class Communicator {
     return store_;
   }
 
-  std::vector<void*> getRemotePtrs(at::Tensor tensor);
+  std::vector<RemoteBufferInfo> getRemoteBuffer(at::Tensor tensor, std::string key);
 
  private:
   struct TensorHash {
@@ -205,8 +245,8 @@ class Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
-  std::unordered_map<at::Tensor, std::vector<void*>, TensorHash, TensorEqual>
-      remote_ptrs_;
+  std::unordered_map<at::Tensor, std::vector<RemoteBufferInfo>, TensorHash, TensorEqual>
+      remote_buffers_;
 };
 
 } // namespace nvfuser
diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu
index cd8275dc92c..9634130cb3d 100644
--- a/tests/cpp/multidevice_kernels.cu
+++ b/tests/cpp/multidevice_kernels.cu
@@ -10,7 +10,8 @@
 // (except raw headers). Compiling dynamic_type.h with nvcc is not supported.
 // Compiling pytorch with nvcc is not supported either.
 
-#include <cuda.h>
+// #include <cuda.h>
+#include <driver_api.h>
 #include <tests/cpp/multidevice_kernels.h>
 
 namespace nvfuser {
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index 82a45c4f87f..8a50d152a4d 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -413,4 +413,59 @@ INSTANTIATE_TEST_SUITE_P(
     testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
     testing::PrintToStringParamName());
 
+using P2PCommunicationTest = MultiDeviceTest; 
+
+TEST_F(P2PCommunicationTest, CudaComm) {
+  static constexpr int kTensorSize = 8;
+  static constexpr int kNumRepetitions = 8;
+
+  if (communicator_->size() < 2 || torch::cuda::device_count() < 2) {
+    GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks.";
+  }
+
+  const DeviceIdxType my_rank = communicator_->deviceId();
+  const DeviceIdxType size = communicator_->size();
+  const DeviceIdxType send_peer = (my_rank + 1) % size;
+  const DeviceIdxType recv_peer = (size + my_rank - 1) % size;
+
+  auto container = std::make_unique<hir::HostIrContainer>();
+  FusionGuard fg(container.get());
+  auto* send_tv = makeContigTensor(1);
+  auto* recv_tv = ops::newValLike(send_tv, send_tv->dtype())->as<TensorView>();
+  container->addInput(send_tv);
+  container->addInput(recv_tv);
+
+  auto* val_recv_peer = IrBuilder::create<Val>(recv_peer, DataType::Int);
+  auto* val_send_peer = IrBuilder::create<Val>(send_peer, DataType::Int);
+
+  auto recv = IrBuilder::create<P2PCommunication>(P2PCommunicationType::RECV, recv_tv, val_recv_peer, CommunicatorBackend::kCuda);
+  auto send = IrBuilder::create<P2PCommunication>(P2PCommunicationType::SEND, send_tv, val_send_peer, CommunicatorBackend::kCuda);
+  auto wait_recv = IrBuilder::create<hir::Wait>(recv);
+  auto wait_send = IrBuilder::create<hir::Wait>(send);
+
+  container->pushBackTopLevelExprs(recv);
+  container->pushBackTopLevelExprs(send);
+  container->pushBackTopLevelExprs(wait_recv);
+  container->pushBackTopLevelExprs(wait_send);
+
+  hir::HostIrEvaluator executor(std::move(container), communicator_);
+
+  at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options);
+  at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options);
+
+  std::unordered_map<Val*, c10::IValue> inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}};
+
+  for (auto repetition : c10::irange(kNumRepetitions)) {
+    send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank);
+    std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl;
+
+    executor.runWithInput(inputs);
+
+    torch::cuda::synchronize();
+    std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl;
+    auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer;
+    EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref;
+  }
+}
+
 } // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 37e72445484..413df0f06a4 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -13,6 +13,7 @@
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
 #include <tests/cpp/multidevice_kernels.h>
+#include <cuda.h>
 
 namespace nvfuser {
 
@@ -137,6 +138,23 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) {
   CUDA_CALL(cudaFree(d_ptr));
 }
 
+class StreamOpTest : public NVFuserTest {};
+
+TEST_F(StreamOpTest, StreamWriteValue32) {
+  cudaStream_t stream;
+  void* buf;
+  int value = 0;
+  constexpr int new_value = 42;
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int)));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream));
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(&value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream));
+  EXPECT_EQ(value, new_value);
+}
+
 TEST_F(GpuCommTest, Allgather) {
   constexpr int64_t kTensorSize = 1024;
 
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index ee916344001..47bb5d915db 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -9,7 +9,8 @@
 #include <ATen/cuda/CUDAGraph.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/ArrayRef.h>
-#include <cuda.h>
+// #include <cuda.h>
+#include <driver_api.h>
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
 #include <fusion.h>

From 371554e0bd21948933c72e9108d496b0ff26ff12 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 5 Feb 2025 15:06:14 -0800
Subject: [PATCH 39/55] working chkpt well prepared for two ranks

---
 csrc/host_ir/executor.cpp                     |  4 +-
 tests/cpp/test_multidevice_communications.cpp | 62 +++++++++++--------
 2 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 070cd299ee3..5a45c9b23e0 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -539,7 +539,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   // }
 
   const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
-  const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(buffer, std::to_string(communication->buffer()->name()));
+  const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(buffer, "");
   const int64_t my_rank = communicator_->deviceId();
   const int64_t peer = expr_evaluator_.evaluate(communication->peer()).as<int64_t>();
   const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
@@ -562,7 +562,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << std::endl;
     // RDMA writes data from sender to receiver
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-        remote_buffers.at(my_rank).ptr(),
+        peer_buffer.ptr(),
         my_buffer.ptr(),
         buffer.numel() * buffer.element_size(),
         cudaMemcpyDeviceToDevice,
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index 8a50d152a4d..ad13c747a5b 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -423,48 +423,58 @@ TEST_F(P2PCommunicationTest, CudaComm) {
     GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks.";
   }
 
+  if (communicator_->size() != 2) {
+    GTEST_SKIP() << "This test needs for now exactly 2 GPUs and 2 ranks.";
+  }
+
+
+
   const DeviceIdxType my_rank = communicator_->deviceId();
   const DeviceIdxType size = communicator_->size();
-  const DeviceIdxType send_peer = (my_rank + 1) % size;
-  const DeviceIdxType recv_peer = (size + my_rank - 1) % size;
 
   auto container = std::make_unique<hir::HostIrContainer>();
   FusionGuard fg(container.get());
-  auto* send_tv = makeContigTensor(1);
-  auto* recv_tv = ops::newValLike(send_tv, send_tv->dtype())->as<TensorView>();
-  container->addInput(send_tv);
-  container->addInput(recv_tv);
-
-  auto* val_recv_peer = IrBuilder::create<Val>(recv_peer, DataType::Int);
-  auto* val_send_peer = IrBuilder::create<Val>(send_peer, DataType::Int);
-
-  auto recv = IrBuilder::create<P2PCommunication>(P2PCommunicationType::RECV, recv_tv, val_recv_peer, CommunicatorBackend::kCuda);
-  auto send = IrBuilder::create<P2PCommunication>(P2PCommunicationType::SEND, send_tv, val_send_peer, CommunicatorBackend::kCuda);
-  auto wait_recv = IrBuilder::create<hir::Wait>(recv);
-  auto wait_send = IrBuilder::create<hir::Wait>(send);
-
-  container->pushBackTopLevelExprs(recv);
-  container->pushBackTopLevelExprs(send);
-  container->pushBackTopLevelExprs(wait_recv);
-  container->pushBackTopLevelExprs(wait_send);
+  auto* tv = makeContigTensor(1);
+  container->addInput(tv);
+  if (my_rank == 0) {
+    const DeviceIdxType send_peer = (my_rank + 1) % size;
+
+    auto* val_send_peer = IrBuilder::create<Val>(send_peer, DataType::Int);
+    auto send = IrBuilder::create<P2PCommunication>(P2PCommunicationType::SEND, tv, val_send_peer, CommunicatorBackend::kCuda);
+    auto wait_send = IrBuilder::create<hir::Wait>(send);
+    container->pushBackTopLevelExprs(send);
+    container->pushBackTopLevelExprs(wait_send);
+  } else {
+    ASSERT_EQ(my_rank, 1);
+    const DeviceIdxType recv_peer = (size + my_rank - 1) % size;
+    auto* val_recv_peer = IrBuilder::create<Val>(recv_peer, DataType::Int);
+    auto recv = IrBuilder::create<P2PCommunication>(P2PCommunicationType::RECV, tv, val_recv_peer, CommunicatorBackend::kCuda);
+    auto wait_recv = IrBuilder::create<hir::Wait>(recv);
+    container->pushBackTopLevelExprs(recv);
+    container->pushBackTopLevelExprs(wait_recv);
+  }
 
   hir::HostIrEvaluator executor(std::move(container), communicator_);
 
-  at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options);
-  at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options);
+  at::Tensor tensor = at::empty({kTensorSize}, tensor_options);
 
-  std::unordered_map<Val*, c10::IValue> inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}};
+  std::unordered_map<Val*, c10::IValue> inputs = {{tv, tensor}};
 
   for (auto repetition : c10::irange(kNumRepetitions)) {
-    send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank);
-    std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl;
+    tensor.copy_(at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+(1-my_rank)));
+    torch::cuda::synchronize();
+    communicator_->barrier();
+    std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", tensor=" << tensor << std::endl;
 
     executor.runWithInput(inputs);
 
     torch::cuda::synchronize();
+    communicator_->barrier();
     std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl;
-    auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer;
-    EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref;
+    if (my_rank == 1) {
+      auto ref = at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+my_rank);
+      EXPECT_TRUE(torch::allclose(tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with tensor " << tensor << " and ref " << ref;
+    }
   }
 }
 

From c7c0404d50ae60f1c8b9596f1cc3ca73bdfaae74 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Fri, 7 Feb 2025 06:00:49 -0800
Subject: [PATCH 40/55] change signature of P2Pcomms to accept src and dst

---
 csrc/host_ir/executor.cpp                     | 30 +++++++++----
 csrc/multidevice/communication.cpp            | 43 ++++++-------------
 csrc/multidevice/communication.h              | 21 ++++-----
 tests/cpp/test_multidevice_communications.cpp |  5 ++-
 tests/cpp/test_multidevice_host_ir.cpp        | 12 +++---
 tests/cpp/test_multidevice_overlap.cpp        |  8 ++--
 6 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 5a45c9b23e0..3e454270ec5 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -477,16 +477,26 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
       communicator_ != nullptr && communicator_->is_available(),
       "A valid communicator must be provided");
 
+  const int64_t my_rank = communicator_->deviceId();
+  const auto dst = expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
+  const auto src = expr_evaluator_.evaluate(communication->src()).as<int64_t>();
+  const bool is_sender = my_rank == src;
+  const bool is_receiver = my_rank == dst;
+  if (!(is_sender || is_receiver)) {
+    return;
+  }
+
+  CommunicatorBackend backend_type = communication->backend();
   at::Tensor buffer =
       getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_);
 
-  CommunicatorBackend backend_type = communication->backend();
 
   if (backend_type != CommunicatorBackend::kCuda) {
     works_[communication] = postSingleCommunication(
         communication,
         communicator_->deviceId(),
-        expr_evaluator_.evaluate(communication->peer()).as<int64_t>(),
+        expr_evaluator_.evaluate(communication->dst()).as<int64_t>(),
+        expr_evaluator_.evaluate(communication->src()).as<int64_t>(),
         communicator_->getWorld(),
         buffer);
     return;
@@ -540,8 +550,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
 
   const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
   const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(buffer, "");
-  const int64_t my_rank = communicator_->deviceId();
-  const int64_t peer = expr_evaluator_.evaluate(communication->peer()).as<int64_t>();
+  const int64_t peer = is_sender ? dst : src;
   const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
   const RemoteBufferInfo& peer_buffer = remote_buffers.at(peer);
   const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
@@ -549,7 +558,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
 
 
-  if (communication->type() == P2PCommunicationType::RECV) {
+  if (is_receiver) {
     std::cout << "RANK " << my_rank << " RECV, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
     // signal to self that transfer is in progress
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT));
@@ -591,13 +600,16 @@ void HostIrEvaluator::handle(Wait* wait) {
     return;
   }
 
-  if (p2p_comm->type() == P2PCommunicationType::RECV) {
+
+  const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as<int64_t>();
+  const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as<int64_t>();
+  const int64_t my_rank = communicator_->deviceId();
+  const bool is_receiver = my_rank == dst;
+  if (is_receiver) {
     // const auto current_stream = static_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
     const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_), std::to_string(p2p_comm->buffer()->name()));
-    const int64_t my_rank = communicator_->deviceId();
-    const int64_t peer = expr_evaluator_.evaluate(p2p_comm->peer()).as<int64_t>();
     const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
-    const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
+    const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[src]);
 
     std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
     // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ));
diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp
index 07861329567..e48290241b0 100644
--- a/csrc/multidevice/communication.cpp
+++ b/csrc/multidevice/communication.cpp
@@ -215,30 +215,16 @@ std::string Communication::toInlineString(int indent_size) const {
   return toString(indent_size);
 }
 
-std::ostream& operator<<(std::ostream& os, const P2PCommunicationType& type) {
-  switch (type) {
-    case P2PCommunicationType::SEND:
-      os << "send";
-      break;
-    case P2PCommunicationType::RECV:
-      os << "recv";
-      break;
-    default:
-      NVF_THROW("unrecognized P2PCommunicationType: ", type);
-  }
-  return os;
-}
-
 P2PCommunication::P2PCommunication(
     IrBuilderPasskey passkey,
-    P2PCommunicationType type,
     TensorView* buffer,
-    Val* peer,
+    Val* dst,
+    Val* src,
     CommunicatorBackend backend)
     : Expr(passkey) {
   addInput(buffer);
-  addDataAttribute(type);
-  addAttribute(peer);
+  addAttribute(dst);
+  addAttribute(src);
   addDataAttribute(backend);
 }
 
@@ -247,9 +233,9 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(P2PCommunication)
 std::string P2PCommunication::toString(const int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << "P2PCommunication " << name() << " ("
-                          << "type=" << type() << ", "
                           << "buffer=" << buffer() << ", "
-                          << "peer=" << peer() << ")\n";
+                          << "dst=" << dst() << ", "
+                          << "src=" << src() << ")\n";
   return ss.str();
 }
 
@@ -588,19 +574,18 @@ c10::intrusive_ptr<c10d::Work> postRecv(
 c10::intrusive_ptr<c10d::Work> postSingleCommunication(
     P2PCommunication* communication,
     DeviceIdxType my_device_index,
-    DeviceIdxType peer,
+    DeviceIdxType dst,
+    DeviceIdxType src,
     c10d::Backend* backend,
     at::Tensor buffer) {
   NVF_ERROR(backend != nullptr);
 
-  switch (communication->type()) {
-    case P2PCommunicationType::SEND:
-      return postSend(communication, my_device_index, peer, backend, buffer);
-    case P2PCommunicationType::RECV:
-      return postRecv(communication, my_device_index, peer, backend, buffer);
-    default:
-      NVF_THROW("Wrong communication type: ", communication->type());
-      return nullptr;
+  if (my_device_index == src) {
+    return postSend(communication, my_device_index, dst, backend, buffer);
+  } else if (my_device_index == dst) {
+    return postRecv(communication, my_device_index, src, backend, buffer);
+  } else {
+    return nullptr;
   }
 }
 
diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h
index d8724356e15..6c2049fba3e 100644
--- a/csrc/multidevice/communication.h
+++ b/csrc/multidevice/communication.h
@@ -121,19 +121,15 @@ class Communication : public Expr {
   void validate();
 };
 
-enum class P2PCommunicationType { SEND, RECV };
-
-std::ostream& operator<<(std::ostream& os, const P2PCommunicationType& type);
-
 class P2PCommunication : public Expr {
  public:
   using Expr::Expr;
 
   P2PCommunication(
       IrBuilderPasskey passkey,
-      P2PCommunicationType type,
       TensorView* buffer,
-      Val* peer,
+      Val* dst,
+      Val* src,
       CommunicatorBackend backend = CommunicatorBackend::kNccl);
 
   P2PCommunication(const P2PCommunication& other) = delete;
@@ -149,15 +145,15 @@ class P2PCommunication : public Expr {
     return "P2PCommunication";
   }
 
-  P2PCommunicationType type() const {
-    return attribute<P2PCommunicationType>(0);
-  }
-
   TensorView* buffer() const {
     return input(0)->as<TensorView>();
   }
 
-  Val* peer() const {
+  Val* dst() const {
+    return attributeVal(0);
+  }
+
+  Val* src() const {
     return attributeVal(1);
   }
 
@@ -235,7 +231,8 @@ c10::intrusive_ptr<c10d::Work> postSingleCommunication(
 c10::intrusive_ptr<c10d::Work> postSingleCommunication(
     P2PCommunication* communication,
     DeviceIdxType my_device_index,
-    DeviceIdxType peer,
+    DeviceIdxType dst,
+    DeviceIdxType src,
     c10d::Backend* backend,
     at::Tensor buffer);
 
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index ad13c747a5b..e5e6e3e78e1 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -435,12 +435,13 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   auto container = std::make_unique<hir::HostIrContainer>();
   FusionGuard fg(container.get());
   auto* tv = makeContigTensor(1);
+  auto* val_my_rank = IrBuilder::create<Val>(my_rank, DataType::Int);
   container->addInput(tv);
   if (my_rank == 0) {
     const DeviceIdxType send_peer = (my_rank + 1) % size;
 
     auto* val_send_peer = IrBuilder::create<Val>(send_peer, DataType::Int);
-    auto send = IrBuilder::create<P2PCommunication>(P2PCommunicationType::SEND, tv, val_send_peer, CommunicatorBackend::kCuda);
+    auto send = IrBuilder::create<P2PCommunication>(tv, val_send_peer, val_my_rank, CommunicatorBackend::kCuda);
     auto wait_send = IrBuilder::create<hir::Wait>(send);
     container->pushBackTopLevelExprs(send);
     container->pushBackTopLevelExprs(wait_send);
@@ -448,7 +449,7 @@ TEST_F(P2PCommunicationTest, CudaComm) {
     ASSERT_EQ(my_rank, 1);
     const DeviceIdxType recv_peer = (size + my_rank - 1) % size;
     auto* val_recv_peer = IrBuilder::create<Val>(recv_peer, DataType::Int);
-    auto recv = IrBuilder::create<P2PCommunication>(P2PCommunicationType::RECV, tv, val_recv_peer, CommunicatorBackend::kCuda);
+    auto recv = IrBuilder::create<P2PCommunication>(tv, val_my_rank, val_recv_peer, CommunicatorBackend::kCuda);
     auto wait_recv = IrBuilder::create<hir::Wait>(recv);
     container->pushBackTopLevelExprs(recv);
     container->pushBackTopLevelExprs(wait_recv);
diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp
index b2e2d12cb6d..ab65c27405f 100644
--- a/tests/cpp/test_multidevice_host_ir.cpp
+++ b/tests/cpp/test_multidevice_host_ir.cpp
@@ -262,13 +262,13 @@ TEST_F(P2PCommHostIrTest, RingPairwiseExchange) {
   TensorView* recv_buffer = makeContigTensor(1);
 
   auto* send = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::SEND,
       send_buffer,
-      IrBuilder::create<Val>(send_peer));
+      IrBuilder::create<Val>(send_peer),
+      IrBuilder::create<Val>(my_device_index));
 
   auto* recv = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::RECV,
       recv_buffer,
+      IrBuilder::create<Val>(my_device_index),
       IrBuilder::create<Val>(recv_peer));
 
   auto* wait = IrBuilder::create<Wait>(recv);
@@ -316,12 +316,12 @@ TEST_F(P2PCommHostIrTest, CoalescedRingPairwiseExchange) {
 
   auto* start_coalescing = IrBuilder::create<StartCoalescing>();
   auto* send = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::SEND,
       send_buffer,
-      IrBuilder::create<Val>(send_peer));
+      IrBuilder::create<Val>(send_peer),
+      IrBuilder::create<Val>(my_device_index));
   auto* recv = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::RECV,
       recv_buffer,
+      IrBuilder::create<Val>(my_device_index),
       IrBuilder::create<Val>(recv_peer));
   auto* end_coalescing = IrBuilder::create<EndCoalescing>();
   auto* wait = IrBuilder::create<Wait>(end_coalescing);
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index 47bb5d915db..c1df5684c47 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -1076,9 +1076,9 @@ TEST_F(
 
   auto* start_coalescing = IrBuilder::create<hir::StartCoalescing>();
   auto* send = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::SEND, src_buffer_ij, send_rank);
+      src_buffer_ij, send_rank, my_device_index_val);
   auto* recv = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::RECV, dst_buffer_ij, recv_rank);
+      dst_buffer_ij, my_device_index_val, recv_rank);
   auto* end_coalescing = IrBuilder::create<hir::EndCoalescing>();
   auto* wait = IrBuilder::create<hir::Wait>(end_coalescing);
 
@@ -1668,9 +1668,9 @@ TEST_F(
 
   auto* start_coalescing = IrBuilder::create<hir::StartCoalescing>();
   auto* send = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::SEND, tva_j_curr_slice, send_rank);
+      tva_j_curr_slice, send_rank, my_device_index_val);
   auto* recv = IrBuilder::create<P2PCommunication>(
-      P2PCommunicationType::RECV, tva_j_next_slice, recv_rank);
+      tva_j_next_slice, my_device_index_val, recv_rank);
   auto* end_coalescing = IrBuilder::create<hir::EndCoalescing>();
   auto* wait = IrBuilder::create<hir::Wait>(end_coalescing);
 

From 6c20a20bc62fc255e8585b6ea442079224424eb9 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Tue, 11 Feb 2025 15:38:14 -0800
Subject: [PATCH 41/55] working chkpt with get zcopy

---
 csrc/dispatch.h                               |   3 +-
 csrc/host_ir/executor.cpp                     | 224 ++++++++++++------
 csrc/host_ir/executor.h                       |  46 ++++
 csrc/host_ir/host_ir.cpp                      |  22 ++
 csrc/host_ir/host_ir.h                        |  23 ++
 csrc/multidevice/communicator.cpp             |  61 -----
 csrc/multidevice/communicator.h               |  65 +----
 tests/cpp/test_multidevice_communications.cpp |  67 +++---
 8 files changed, 288 insertions(+), 223 deletions(-)

diff --git a/csrc/dispatch.h b/csrc/dispatch.h
index ee47464a6fb..1eb584bc2d7 100644
--- a/csrc/dispatch.h
+++ b/csrc/dispatch.h
@@ -155,7 +155,8 @@ class Val;
   f(Wait);                            \
   f(Synchronize);                     \
   f(StartCoalescing);                 \
-  f(EndCoalescing);
+  f(EndCoalescing);                   \
+  f(ShareMemHandles);
 
 // Forward declarations for all Val and Expr types
 
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 3e454270ec5..6b144ddd7d8 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -410,6 +410,125 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
   }
 }
 
+RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor) : ptr_(tensor.data_ptr()), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) {
+  // Exporter-side constructor: records the local tensor's pointer, storage offset and element size, then creates CUDA IPC handles for the tensor memory and for a newly allocated semaphore array.
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); // NOTE(review): the importing constructor adds storage_offset * element_size to the opened pointer — confirm that matches what this handle maps
+  const auto number_of_semaphores = Communicator::getInstance().size(); // one slot per communicator entry; the evaluator indexes semaphores() by peer rank
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore)));
+  static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int");
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, number_of_semaphores * sizeof(IpcSemaphore))); // NOTE(review): cudaMemset writes a byte value, so every slot equals kReady only if kReady has a uniform byte pattern (e.g. 0) — confirm against the enum
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); // handle through which other processes can open the semaphore array
+}
+
+RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data) : is_imported_(true) { // Importer-side constructor: rebuilds a RemoteBufferInfo from its serialized bytes and opens the IPC handles it carries
+  const RemoteBufferInfo& imported_buffer = fromBytes<RemoteBufferInfo>(data); // decode the bytes as a RemoteBufferInfo (presumably the counterpart of toBytes — confirm)
+  // Copy the plain metadata and both IPC handles; the pointer members are re-derived below by opening those handles.
+  storage_offset_ = imported_buffer.storage_offset_;
+  element_size_ = imported_buffer.element_size_;
+  ipc_handle_ = imported_buffer.ipc_handle_;
+  semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
+  // Open the data handle, then advance the returned pointer by the storage offset converted to bytes.
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+  ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
+  // Open the semaphore array exported alongside the data buffer.
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+}
+
+RemoteBufferInfo::~RemoteBufferInfo() { // Releases what this instance acquired: the two opened IPC mappings when imported, the semaphore allocation when exported
+  if (is_imported_) {
+    std::cout << "RANK " << Communicator::getInstance().deviceId() << " closes ipc handle" << std::endl;
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)((uint8_t*)ptr_ - storage_offset_ * element_size_))); // undo the storage-offset shift applied at import: the mapping must be closed with the pointer cudaIpcOpenMemHandle returned
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_)); // semaphores_ was never shifted, so it is still the opened pointer
+  } else {
+    std::cout << "RANK " << Communicator::getInstance().deviceId() << " frees semaphores" << std::endl;
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_)); // only the semaphore array is owned here; the tensor memory itself is not freed by this class
+  }
+}
+
+std::ostream& operator<<(std::ostream& os, const cudaIpcMemHandle_t& info) { // Debug printer: writes the opaque handle as lowercase hex, two digits per byte
+  const auto* bytes = reinterpret_cast<const uint8_t*>(&info); // view the handle as its raw bytes
+  for (int i = 0; i < (int)sizeof(cudaIpcMemHandle_t); i++) {
+    os << "0123456789abcdef"[bytes[i] >> 4] << "0123456789abcdef"[bytes[i] & 0xF]; // streaming a uint8_t directly would emit a raw, possibly unprintable character; a digit lookup also leaves the stream's format flags untouched
+  }
+  return os;
+}
+
+
+
+std::ostream& operator<<(std::ostream& os, const RemoteBufferInfo& info) { // Debug printer: one "name=value" pair per member, wrapped in "RemoteBufferInfo(...)"
+  os << "RemoteBufferInfo(ptr=" << info.ptr_;
+  os << ", storage_offset=" << info.storage_offset_;
+  os << ", element_size=" << info.element_size_;
+  os << ", is_imported=" << info.is_imported_;
+  os << ", semaphores_=" << info.semaphores_;
+  os << ", ipc_handle_=" << info.ipc_handle_;
+  os << ", semaphores_ipc_handle_=" << info.semaphores_ipc_handle_;
+  os << ")";
+  return os; // hand the stream back so calls can be chained
+}
+
+void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // Exchanges RemoteBufferInfo handles for the listed P2P communications through the TCP store and caches them in remote_buffers_
+  const int64_t my_rank = communicator_->deviceId();
+  auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { // evaluates the communication's buffer to its at::Tensor
+    return this->expr_evaluator_.evaluate(communication->buffer()).as<at::Tensor>();
+  };
+  // Step 1: select the communications whose buffers still need a handle exchange on this rank.
+  std::vector<P2PCommunication*> communications;
+  for (auto expr: share_mem_handles->communications()) {
+    auto communication = expr->as<P2PCommunication>();
+    const auto dst = expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
+    const auto src = expr_evaluator_.evaluate(communication->src()).as<int64_t>();
+    const bool is_sender = my_rank == src;
+    const bool is_receiver = my_rank == dst;
+    if (!(is_sender ^ is_receiver)) { // REMOVE or adapt exporting/opening the handle
+      return; // NOTE(review): this leaves the whole handler, so the rank skips the remaining communications and the barrier below — confirm that is intended
+    }
+    if (remote_buffers_.find(get_tensor(communication)) != remote_buffers_.end()) {
+      continue; // handles for this tensor are already cached
+    }
+    communications.push_back(communication);
+  }
+
+  // Step 2: publish this rank's handles to the TCP store.
+  auto get_key = [this] (P2PCommunication* communication, int64_t rank) -> std::string { // NOTE(review): the key is built from dst, src and rank only, so two communications with the same dst/src share a key — confirm this cannot collide
+    return "nvfuser_remote_buffer_info_P2PComm_dst=" + std::to_string(this->expr_evaluator_.evaluate(communication->dst()).as<int64_t>()) + "_src=" + std::to_string(this->expr_evaluator_.evaluate(communication->src()).as<int64_t>()) + "_rank=" + std::to_string(rank);
+  };
+  std::unordered_map<P2PCommunication*, std::unique_ptr<RemoteBufferInfo>> buffer_handles;
+  auto store = communicator_->getTcpStore();
+  for (P2PCommunication* communication: communications) {
+    auto buffer_handle = std::make_unique<RemoteBufferInfo>(get_tensor(communication));
+    std::cout << "RANK " << my_rank << " registers at key " << get_key(communication, my_rank) << std::endl;
+    store->set(get_key(communication, my_rank), toBytes(*buffer_handle));
+    std::cout << "RANK "  << my_rank << " creates buffer_handle " << *buffer_handle << std::endl;
+    buffer_handles.emplace(communication, std::move(buffer_handle));
+  }
+
+  // barrier to ensure all ranks have pushed their memhandles to the store
+  // TODO: precisely select what ranks need to wait on that barrier.
+  communicator_->barrier();
+
+  // Step 3: read every rank's handles back from the TCP store and build one entry per rank.
+  for (P2PCommunication* communication: communications) {
+    std::vector<std::unique_ptr<RemoteBufferInfo>> remote_buffers;
+    remote_buffers.reserve(communicator_->size());
+    for (int64_t rank : c10::irange(communicator_->size())) {
+      std::cout << "RANK " << my_rank << " after barrier for key " << get_key(communication, rank) << std::endl;
+      if (rank == my_rank) {
+        // opening an ipc handle on the exporter's device is not supported
+        remote_buffers.push_back(std::move(buffer_handles.at(communication))); // reuse the locally exported info for our own slot
+      } else {
+        std::string key = get_key(communication, rank);
+        NVF_ERROR(store->check({key}), "key ", key, " not found in store at rank ", my_rank);
+        auto imported_remote_buffer_info = std::make_unique<RemoteBufferInfo>(store->get(key)); // the byte-vector constructor opens the peer's IPC handles
+        remote_buffers.push_back(std::move(imported_remote_buffer_info));
+      }
+      std::cout << "RANK "  << my_rank << " emplaces at rank " << rank << " remote buffer " << *remote_buffers.back() << std::endl;
+    }
+    remote_buffers_.emplace(get_tensor(communication), std::move(remote_buffers)); // cache keyed by the tensor; later looked up with find(buffer)
+  }
+}
+
+
 void HostIrEvaluator::handle(Communication* communication) {
   NVF_ERROR(
       communicator_ != nullptr && communicator_->is_available(),
@@ -482,7 +601,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   const auto src = expr_evaluator_.evaluate(communication->src()).as<int64_t>();
   const bool is_sender = my_rank == src;
   const bool is_receiver = my_rank == dst;
-  if (!(is_sender || is_receiver)) {
+  if (!(is_sender ^ is_receiver)) {
     return;
   }
 
@@ -502,80 +621,49 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     return;
   }
 
-
-
-  // FIST TIME:
-  // sender exports cudaIpc mem handle on input buffer and put it to store
-  // sender signals recv it can open the mem handle. It needs to be CPU blocking
-  // recv opens the handle and gets the pointer.
-  // It copies the data and then signal sender on completion
-
-  // SECOND TIME:
-  // Sender signals recv it can copy
-  // Recv copies the data and signals sender on completion
-
-// each rank must have a bool "recvied" and a "sent" bool per rank. So n+1
-// each rank must have, per rank, a sent_to and a received_from a bool "recvied" and a "sent" bool per rank. So n+1
-
-
-
-  // std::string prefix = "nvfuser_ipc_tensor_info_" + communication->buffer()->name() + "_";
-  // IpcTensorInfo ipc_tensor_info;
-  // NVFUSER_CUDA_RT_SAFE_CALL(
-  //     cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, buffer.data_ptr()));
-  // ipc_tensor_info.storage_offset = buffer.storage_offset();
-  // ipc_tensor_info.element_size = buffer.element_size();
-
-  // auto store = communicator_->getTcpStore();
-  // store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info));
-
-  // Team team = {my_rank, peer};
-  // communicator_->getBackendForTeam(team, CommunicatorBackend::kNccl)->barrier()->wait();
-
-  // for (int64_t rank : c10::irange(size())) {
-  //   if (rank == my_rank) {
-  //     remote_ptrs.at(rank) = tensor.data_ptr();
-  //   } else {
-  //     ipc_tensor_info =
-  //         fromBytes<IpcTensorInfo>(store->get(prefix + std::to_string(rank)));
-  //     void*& ptr = remote_ptrs.at(rank);
-  //     NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
-  //         &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess));
-  //     // TODO: close ipc mem handle at shutdown
-  //     ptr = (void*)((uint8_t*)ptr +
-  //                   ipc_tensor_info.storage_offset *
-  //                       ipc_tensor_info.element_size);
-  //   }
-  // }
-
-  const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
-  const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(buffer, "");
+  const auto it = remote_buffers_.find(buffer);
+  NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", communication->toString(), " at rank ", my_rank);
+  const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers = it->second;
   const int64_t peer = is_sender ? dst : src;
-  const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
-  const RemoteBufferInfo& peer_buffer = remote_buffers.at(peer);
+  const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
+  const RemoteBufferInfo& peer_buffer = *remote_buffers.at(peer);
   const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
   const auto remote_semaphore = reinterpret_cast<CUdeviceptr>(&peer_buffer.semaphores()[my_rank]);
   static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
 
+  const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
 
   if (is_receiver) {
-    std::cout << "RANK " << my_rank << " RECV, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
+    std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
     // signal to self that transfer is in progress
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT));
     // signal sender that receiver is ready
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error
+    std::cout << "RANK " << my_rank << " RECV BEFORE MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
+        buffer.data_ptr(),
+        peer_buffer.ptr(),
+        // my_buffer.ptr(),
+        buffer.numel() * buffer.element_size(),
+        cudaMemcpyDeviceToDevice
+        // current_stream));
+        ));
+    std::cout << "RANK " << my_rank << " RECV AFTER MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
   } else /*sender*/ {
-    std::cout << "RANK " << my_rank << " SEND, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
+    std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
     // wait for sender to be ready
-    // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
-    std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << std::endl;
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
+    std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl;
     // RDMA writes data from sender to receiver
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-        peer_buffer.ptr(),
-        my_buffer.ptr(),
-        buffer.numel() * buffer.element_size(),
-        cudaMemcpyDeviceToDevice,
-        current_stream));
+    // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+    // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
+    //     peer_buffer.ptr(),
+    //     buffer.data_ptr(),
+    //     // my_buffer.ptr(),
+    //     buffer.numel() * buffer.element_size(),
+    //     cudaMemcpyDeviceToDevice
+    //     // current_stream));
+    //     ));
     std::cout << "RANK " << my_rank << " SEND after memcpy" << std::endl;
     // Signals completion to self
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
@@ -606,14 +694,18 @@ void HostIrEvaluator::handle(Wait* wait) {
   const int64_t my_rank = communicator_->deviceId();
   const bool is_receiver = my_rank == dst;
   if (is_receiver) {
-    // const auto current_stream = static_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
-    const std::vector<RemoteBufferInfo>& remote_buffers = communicator_->getRemoteBuffer(getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_), std::to_string(p2p_comm->buffer()->name()));
-    const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank);
+    const auto current_stream = static_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
+    at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
+    const auto it = remote_buffers_.find(buffer);
+    NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank);
+    const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers = it->second;
+
+    const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
     const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[src]);
 
     std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
-    // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ));
-    std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ));
+    std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << "recv tensor=" << buffer << std::endl;
   }
 }
 
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 8e281b66143..f052c1bfeb7 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -50,6 +50,35 @@ class HostIrExecutor : public ExecutorAbstract {
 
 namespace hir {
 
+enum class IpcSemaphore : cuuint32_t {
+  kReady,
+  kTransferInProgress
+};
+
+class RemoteBufferInfo {
+ public:
+
+  RemoteBufferInfo(at::Tensor tensor);
+  RemoteBufferInfo(std::vector<uint8_t> data); // means it is imported
+  ~RemoteBufferInfo();
+
+  void* ptr() const {
+    return ptr_;
+  }
+
+  auto semaphores() const {
+    return semaphores_;
+  }
+
+  void* ptr_;
+  int64_t storage_offset_;
+  int64_t element_size_;
+  bool is_imported_;
+  cudaIpcMemHandle_t ipc_handle_;
+  cudaIpcMemHandle_t semaphores_ipc_handle_;
+  IpcSemaphore* semaphores_;
+};
+
 /*
 a HostIrEvaluator evaluates a host programs represented through a
 HostIrContainer It is instantiated with the desired HostIrContainer, and runs
@@ -129,6 +158,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   void handle(MatmulOp* matmul) override;
   void handle(LinearOp* linear) override;
   void handle(kir::Allocate* allocate) override;
+  void handle(ShareMemHandles* share_mem_handles) override;
   void unhandled(Statement* stmt) override;
 
   c10::cuda::CUDAStream getCUDAStream(Stream* stream);
@@ -145,6 +175,22 @@ class HostIrEvaluator final : public OptOutDispatch {
   std::unordered_map<StreamKey, c10::cuda::CUDAStream> streams_;
   std::unordered_map<Expr*, c10::intrusive_ptr<c10d::Work>> works_;
   const int64_t my_local_device_index_;
+  struct TensorHash {
+    std::size_t operator()(const at::Tensor& tensor) const {
+      auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
+      auto offset = tensor.storage_offset();
+      auto element_size = tensor.element_size();
+      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^
+          std::hash<int>()(element_size);
+    }
+  };
+  struct TensorEqual { // key equality = same buffer identity, mirroring the fields TensorHash mixes
+    bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const {
+      return lhs.data_ptr() == rhs.data_ptr() && lhs.storage_offset() == rhs.storage_offset() && lhs.element_size() == rhs.element_size(); // lhs.equal(rhs) would compare element values instead of identity
+    }
+  };
+  std::unordered_map<at::Tensor, std::vector<std::unique_ptr<RemoteBufferInfo>>, TensorHash, TensorEqual>
+      remote_buffers_;
 };
 
 } // namespace hir
diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp
index c99ddb2f345..5ea51fd82ff 100644
--- a/csrc/host_ir/host_ir.cpp
+++ b/csrc/host_ir/host_ir.cpp
@@ -323,6 +323,28 @@ std::string EndCoalescing::toInlineString(int indent_size) const {
   NVF_CHECK(false, "Cannot be printed inline");
 }
 
+
+ShareMemHandles::ShareMemHandles(IrBuilderPasskey passkey, std::vector<P2PCommunication*> communications) : Expr(passkey) {
+  NVF_ERROR(passkey.ir_container_ != nullptr);
+  NVF_ERROR(
+      passkey.ir_container_->isA<HostIrContainer>(),
+      this,
+      "must be registered in a HostIrContainer");
+  addDataAttribute(std::move(communications));
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(ShareMemHandles)
+
+std::string ShareMemHandles::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << "ShareMemHandles" << std::endl;
+  return ss.str();
+}
+
+std::string ShareMemHandles::toInlineString(int indent_size) const {
+  NVF_CHECK(false, "Cannot be printed inline");
+}
+
 } // namespace hir
 
 } // namespace nvfuser
diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h
index 3ca06779684..64cdb404a8c 100644
--- a/csrc/host_ir/host_ir.h
+++ b/csrc/host_ir/host_ir.h
@@ -315,6 +315,29 @@ class EndCoalescing : public Expr {
   }
 };
 
+class ShareMemHandles : public Expr {
+ public:
+  using Expr::Expr;
+  ShareMemHandles(IrBuilderPasskey passkey, std::vector<P2PCommunication*> communications);
+
+  ShareMemHandles(const ShareMemHandles& other) = delete;
+  ShareMemHandles& operator=(const ShareMemHandles& other) = delete;
+  ShareMemHandles(ShareMemHandles&& other) = delete;
+  ShareMemHandles& operator=(ShareMemHandles&& other) = delete;
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+  const char* getOpString() const override {
+    return "hir::ShareMemHandles";
+  }
+
+  const std::vector<P2PCommunication*>& communications() const { // const accessor: only reads attribute 0
+    return attribute<std::vector<P2PCommunication*>>(0);
+  }
+};
+
 } // namespace hir
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 902ceaaa64d..ce102695637 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -319,65 +319,4 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
-RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor, int64_t size) : ptr_(tensor.data_ptr()), size_(size), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) {
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, size_ * sizeof(IpcSemaphore)));
-  static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int");
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, size_ * sizeof(IpcSemaphore)));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
-}
-
-RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data) : is_imported_(true) {
-  RemoteBufferInfo imported_buffer = fromBytes<RemoteBufferInfo>(data);
-
-  size_ = imported_buffer.size_;
-  storage_offset_ = imported_buffer.storage_offset_;
-  element_size_ = imported_buffer.element_size_;
-  ipc_handle_ = imported_buffer.ipc_handle_;
-  semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
-  ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
-}
-
-RemoteBufferInfo::~RemoteBufferInfo() {
-  // if (is_imported_) {
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&ipc_handle_));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&semaphores_ipc_handle_));
-  // } else {
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(semaphores_));
-  // }
-}
-
-
-std::vector<RemoteBufferInfo> Communicator::getRemoteBuffer(at::Tensor tensor, std::string key) {
-  auto it = remote_buffers_.find(tensor);
-  if (it == remote_buffers_.end()) {
-    RemoteBufferInfo buffer_handle(tensor, size_);
-
-    auto store = getTcpStore();
-    std::string prefix = "nvfuser_remote_buffer_info_" + key;
-    std::cout << "RANK " << deviceId() << "registers at key " << prefix + std::to_string(deviceId()) << std::endl;
-    store->set(prefix + std::to_string(deviceId()), toBytes(buffer_handle));
-
-    barrier();
-
-    std::cout << "RANK " << deviceId() << "after barrier for key " << prefix + std::to_string(deviceId()) << std::endl;
-    std::vector<RemoteBufferInfo> remote_buffers;
-    remote_buffers.reserve(size_);
-    for (int64_t rank : c10::irange(size_)) {
-      if (rank == deviceId()) {
-        remote_buffers.push_back(std::move(buffer_handle));
-      } else {
-        RemoteBufferInfo imported_remote_buffer_info(store->get(prefix + std::to_string(rank)));
-        remote_buffers.push_back(std::move(imported_remote_buffer_info));
-      }
-    }
-    it = remote_buffers_.emplace(tensor, std::move(remote_buffers)).first;
-  }
-  return it->second;
-}
-
 } // namespace nvfuser
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 39c9d667bf4..53c9fbcead8 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -30,52 +30,17 @@
 namespace nvfuser {
 
 template <typename T>
-std::vector<uint8_t> toBytes(T data) {
+std::vector<uint8_t> toBytes(const T& data) {
   return std::vector<uint8_t>(
-      reinterpret_cast<uint8_t*>(&data),
-      reinterpret_cast<uint8_t*>(&data) + sizeof(T));
+      reinterpret_cast<const uint8_t*>(&data),
+      reinterpret_cast<const uint8_t*>(&data) + sizeof(T));
 }
 
 template <typename T>
-T fromBytes(std::vector<uint8_t> bytes) {
-  return *reinterpret_cast<T*>(bytes.data());
+const T& fromBytes(const std::vector<uint8_t>& bytes) { // reinterprets the start of `bytes` as a T; no size check is performed here
+  return *reinterpret_cast<const T*>(bytes.data()); // the result refers to storage owned by `bytes`, so it must not outlive that vector
 }
 
-enum class IpcSemaphore : cuuint32_t {
-  kReady,
-  kTransferInProgress
-};
-
-class RemoteBufferInfo {
- public:
-
-  RemoteBufferInfo(at::Tensor tensor, int64_t size);
-  RemoteBufferInfo(std::vector<uint8_t> data); // means it is imported
-  ~RemoteBufferInfo();
-
-  void* ptr() const {
-    return ptr_;
-  }
-
-  auto semaphores() const {
-    return semaphores_;
-  }
-
-  auto size() const {
-    return size_;
-  }
-
- private:
-  void* ptr_;
-  int64_t size_;
-  int64_t storage_offset_;
-  int64_t element_size_;
-  bool is_imported_;
-  cudaIpcMemHandle_t ipc_handle_;
-  cudaIpcMemHandle_t semaphores_ipc_handle_;
-  IpcSemaphore* semaphores_;
-};
-
 // This file implements the class Communicator which sets up the inter-process
 // Backend. This class contains inter-process information, such as the rank, the
 // world size, as well as the Process Group that can be called to perform
@@ -194,25 +159,7 @@ class Communicator {
     return store_;
   }
 
-  std::vector<RemoteBufferInfo> getRemoteBuffer(at::Tensor tensor, std::string key);
-
  private:
-  struct TensorHash {
-    std::size_t operator()(const at::Tensor& tensor) const {
-      auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
-      auto offset = tensor.storage_offset();
-      auto element_size = tensor.element_size();
-      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^
-          std::hash<int>()(element_size);
-    }
-  };
-
-  struct TensorEqual {
-    bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const {
-      return lhs.equal(rhs);
-    }
-  };
-
   Communicator(
       CommunicatorBackend backend = comm_backend_default,
       RankType server_local_rank = comm_server_local_rank_default);
@@ -245,8 +192,6 @@ class Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
-  std::unordered_map<at::Tensor, std::vector<RemoteBufferInfo>, TensorHash, TensorEqual>
-      remote_buffers_;
 };
 
 } // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index e5e6e3e78e1..ebd6ef0600b 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -417,65 +417,62 @@ using P2PCommunicationTest = MultiDeviceTest;
 
 TEST_F(P2PCommunicationTest, CudaComm) {
   static constexpr int kTensorSize = 8;
-  static constexpr int kNumRepetitions = 8;
+  static constexpr int kNumRepetitions = 2;
 
   if (communicator_->size() < 2 || torch::cuda::device_count() < 2) {
     GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks.";
   }
 
-  if (communicator_->size() != 2) {
-    GTEST_SKIP() << "This test needs for now exactly 2 GPUs and 2 ranks.";
-  }
-
-
-
   const DeviceIdxType my_rank = communicator_->deviceId();
   const DeviceIdxType size = communicator_->size();
+  const DeviceIdxType send_peer = (my_rank + 1) % size;
+  const DeviceIdxType recv_peer = (size + my_rank - 1) % size;
 
   auto container = std::make_unique<hir::HostIrContainer>();
   FusionGuard fg(container.get());
-  auto* tv = makeContigTensor(1);
-  auto* val_my_rank = IrBuilder::create<Val>(my_rank, DataType::Int);
-  container->addInput(tv);
-  if (my_rank == 0) {
-    const DeviceIdxType send_peer = (my_rank + 1) % size;
-
-    auto* val_send_peer = IrBuilder::create<Val>(send_peer, DataType::Int);
-    auto send = IrBuilder::create<P2PCommunication>(tv, val_send_peer, val_my_rank, CommunicatorBackend::kCuda);
-    auto wait_send = IrBuilder::create<hir::Wait>(send);
-    container->pushBackTopLevelExprs(send);
-    container->pushBackTopLevelExprs(wait_send);
-  } else {
-    ASSERT_EQ(my_rank, 1);
-    const DeviceIdxType recv_peer = (size + my_rank - 1) % size;
-    auto* val_recv_peer = IrBuilder::create<Val>(recv_peer, DataType::Int);
-    auto recv = IrBuilder::create<P2PCommunication>(tv, val_my_rank, val_recv_peer, CommunicatorBackend::kCuda);
-    auto wait_recv = IrBuilder::create<hir::Wait>(recv);
-    container->pushBackTopLevelExprs(recv);
-    container->pushBackTopLevelExprs(wait_recv);
-  }
+
+  auto* my_rank_val = IrBuilder::create<Val>(my_rank, DataType::Int);
+  auto* recv_peer_val = IrBuilder::create<Val>(recv_peer, DataType::Int);
+  auto* send_peer_val = IrBuilder::create<Val>(send_peer, DataType::Int);
+
+  auto* send_tv = makeContigTensor(1);
+  auto* recv_tv = makeContigTensor(1);
+  container->addInput(send_tv);
+  container->addInput(recv_tv);
+
+  auto recv = IrBuilder::create<P2PCommunication>(recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda);
+  auto send = IrBuilder::create<P2PCommunication>(send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda);
+  std::vector<P2PCommunication*> grouped_communications = {recv, send};
+  auto share_mem_handles = IrBuilder::create<hir::ShareMemHandles>(std::move(grouped_communications));
+  auto wait_recv = IrBuilder::create<hir::Wait>(recv);
+  auto wait_send = IrBuilder::create<hir::Wait>(send);
+
+  container->pushBackTopLevelExprs(share_mem_handles);
+  container->pushBackTopLevelExprs(recv);
+  container->pushBackTopLevelExprs(send);
+  container->pushBackTopLevelExprs(wait_recv);
+  container->pushBackTopLevelExprs(wait_send);
 
   hir::HostIrEvaluator executor(std::move(container), communicator_);
 
-  at::Tensor tensor = at::empty({kTensorSize}, tensor_options);
+  at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options);
+  at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options);
 
-  std::unordered_map<Val*, c10::IValue> inputs = {{tv, tensor}};
+  std::unordered_map<Val*, c10::IValue> inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}};
 
   for (auto repetition : c10::irange(kNumRepetitions)) {
-    tensor.copy_(at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+(1-my_rank)));
+    send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank);
+    std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl;
     torch::cuda::synchronize();
     communicator_->barrier();
-    std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", tensor=" << tensor << std::endl;
 
     executor.runWithInput(inputs);
 
     torch::cuda::synchronize();
     communicator_->barrier();
     std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl;
-    if (my_rank == 1) {
-      auto ref = at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+my_rank);
-      EXPECT_TRUE(torch::allclose(tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with tensor " << tensor << " and ref " << ref;
-    }
+    auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer;
+    EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref;
   }
 }
 

From f7409b20b0a649d21bf4e7445e075f4f12d42498 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 04:46:15 -0800
Subject: [PATCH 42/55] working checkpt with many ranks

---
 csrc/host_ir/executor.cpp                     | 6 ++++++
 tests/cpp/test_multidevice_communications.cpp | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 6b144ddd7d8..b1f45ee610c 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -511,7 +511,13 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
   for (P2PCommunication* communication: communications) {
     std::vector<std::unique_ptr<RemoteBufferInfo>> remote_buffers;
     remote_buffers.reserve(communicator_->size());
+    const auto dst = expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
+    const auto src = expr_evaluator_.evaluate(communication->src()).as<int64_t>();
     for (int64_t rank : c10::irange(communicator_->size())) {
+      if (rank != src && rank != dst) {
+        remote_buffers.push_back(nullptr);
+        continue;
+      }
       std::cout << "RANK " << my_rank << " after barrier for key " << get_key(communication, rank) << std::endl;
       if (rank == my_rank) {
         // opening an ipc handle on the exporter's device is not supported
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index ebd6ef0600b..df43712d4d3 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -461,7 +461,7 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   std::unordered_map<Val*, c10::IValue> inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}};
 
   for (auto repetition : c10::irange(kNumRepetitions)) {
-    send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank);
+    send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * my_rank);
     std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl;
     torch::cuda::synchronize();
     communicator_->barrier();
@@ -471,7 +471,7 @@ TEST_F(P2PCommunicationTest, CudaComm) {
     torch::cuda::synchronize();
     communicator_->barrier();
     std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl;
-    auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer;
+    auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * recv_peer;
     EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref;
   }
 }

From 08f8fe03c041b3960943825551439279578dd5d2 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 05:09:10 -0800
Subject: [PATCH 43/55] chkpt non blocking

---
 csrc/host_ir/executor.cpp                     | 39 +++++++++----------
 tests/cpp/test_multidevice_communications.cpp |  2 +-
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index b1f45ee610c..d6cfa474e52 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -640,12 +640,11 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
 
   if (is_receiver) {
-    std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
-    // signal to self that transfer is in progress
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT));
-    // signal sender that receiver is ready
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error
-    std::cout << "RANK " << my_rank << " RECV BEFORE MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
+    // wait for sender to be ready
+    std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
+    std::cout << "RANK " << my_rank << " RECV after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl;
+    // RDMA get the data from the sender
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
         buffer.data_ptr(),
         peer_buffer.ptr(),
@@ -654,13 +653,19 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
         cudaMemcpyDeviceToDevice
         // current_stream));
         ));
-    std::cout << "RANK " << my_rank << " RECV AFTER MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
+    std::cout << "RANK " << my_rank << " RECV after memcpy" << std::endl;
+    // Signals completion to self
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
+    // Signals completion to receiver
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
   } else /*sender*/ {
-    std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
-    // wait for sender to be ready
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
-    std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl;
-    // RDMA writes data from sender to receiver
+    std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
+    std::cout << "RANK " << my_rank << " SEND BEFORE signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
+    // signal to self that transfer is in progress
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT));
+    // signal to receiver that the buffer is ready
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error
+    std::cout << "RANK " << my_rank << " SEND AFTER signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
     // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
     // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
     //     peer_buffer.ptr(),
@@ -670,11 +675,6 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     //     cudaMemcpyDeviceToDevice
     //     // current_stream));
     //     ));
-    std::cout << "RANK " << my_rank << " SEND after memcpy" << std::endl;
-    // Signals completion to self
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
-    // Signals completion to receiver
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
   }
 }
 
@@ -698,8 +698,7 @@ void HostIrEvaluator::handle(Wait* wait) {
   const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as<int64_t>();
   const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as<int64_t>();
   const int64_t my_rank = communicator_->deviceId();
-  const bool is_receiver = my_rank == dst;
-  if (is_receiver) {
+  if (my_rank == src) {
     const auto current_stream = static_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
     at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
     const auto it = remote_buffers_.find(buffer);
@@ -707,7 +706,7 @@ void HostIrEvaluator::handle(Wait* wait) {
     const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers = it->second;
 
     const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
-    const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[src]);
+    const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[dst]);
 
     std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
     NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ));
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index df43712d4d3..c0a41348a1f 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -448,8 +448,8 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   auto wait_send = IrBuilder::create<hir::Wait>(send);
 
   container->pushBackTopLevelExprs(share_mem_handles);
-  container->pushBackTopLevelExprs(recv);
   container->pushBackTopLevelExprs(send);
+  container->pushBackTopLevelExprs(recv);
   container->pushBackTopLevelExprs(wait_recv);
   container->pushBackTopLevelExprs(wait_send);
 

From de843bb8c13d3b9e4f412ccf38bdb02507eba175 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 09:02:18 -0800
Subject: [PATCH 44/55] harden tests by removing hard syncs

---
 tests/cpp/test_multidevice_communications.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index c0a41348a1f..74efa49efbe 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -417,7 +417,7 @@ using P2PCommunicationTest = MultiDeviceTest;
 
 TEST_F(P2PCommunicationTest, CudaComm) {
   static constexpr int kTensorSize = 8;
-  static constexpr int kNumRepetitions = 2;
+  static constexpr int kNumRepetitions = 32;
 
   if (communicator_->size() < 2 || torch::cuda::device_count() < 2) {
     GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks.";
@@ -463,13 +463,9 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   for (auto repetition : c10::irange(kNumRepetitions)) {
     send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * my_rank);
     std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl;
-    torch::cuda::synchronize();
-    communicator_->barrier();
 
     executor.runWithInput(inputs);
 
-    torch::cuda::synchronize();
-    communicator_->barrier();
     std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl;
     auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * recv_peer;
     EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref;

From 4dc9936e9b22c6d5f9bbaaa356bd10dda6f6126b Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 09:09:15 -0800
Subject: [PATCH 45/55] use cudaMemcpyAsync

---
 csrc/host_ir/executor.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index d6cfa474e52..f74e1a93306 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -645,15 +645,12 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
     std::cout << "RANK " << my_rank << " RECV after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl;
     // RDMA get the data from the sender
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
         buffer.data_ptr(),
         peer_buffer.ptr(),
-        // my_buffer.ptr(),
         buffer.numel() * buffer.element_size(),
-        cudaMemcpyDeviceToDevice
-        // current_stream));
-        ));
-    std::cout << "RANK " << my_rank << " RECV after memcpy" << std::endl;
+        cudaMemcpyDeviceToDevice,
+        current_stream));
     // Signals completion to self
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
     // Signals completion to receiver

From 4e056093a10e170b29e5af3146c633285c3506db Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 09:11:44 -0800
Subject: [PATCH 46/55] clean and lint

---
 csrc/host_ir/executor.cpp                     | 255 +++++++++---------
 csrc/host_ir/executor.h                       |  12 +-
 csrc/host_ir/host_ir.cpp                      |   6 +-
 csrc/host_ir/host_ir.h                        |   4 +-
 csrc/multidevice/communicator.h               |   5 +-
 tests/cpp/test_multidevice_communications.cpp |  27 +-
 tests/cpp/test_multidevice_gpu_comms.cpp      |  11 +-
 tests/cpp/test_multidevice_overlap.cpp        |   2 +-
 8 files changed, 173 insertions(+), 149 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index f74e1a93306..ae4dbb028b8 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -8,6 +8,7 @@
 
 #include <ATen/cuda/CUDAContext.h>
 
+#include <cuda.h>
 #include <dynamic_transform.h>
 #include <fusion_profiler.h>
 #include <host_ir/executor.h>
@@ -21,7 +22,6 @@
 #include <runtime/executor_dispatch.h>
 #include <runtime/executor_kernel_arg.h>
 #include <runtime/fusion_kernel_runtime.h>
-#include <cuda.h>
 
 namespace nvfuser {
 
@@ -410,17 +410,29 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
   }
 }
 
-RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor) : ptr_(tensor.data_ptr()), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) {
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
+RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor)
+    : ptr_(tensor.data_ptr()),
+      storage_offset_(tensor.storage_offset()),
+      element_size_(tensor.element_size()),
+      is_imported_(false) {
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
   const auto number_of_semaphores = Communicator::getInstance().size();
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore)));
-  static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int");
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, number_of_semaphores * sizeof(IpcSemaphore)));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(
+      (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore)));
+  static_assert(
+      sizeof(IpcSemaphore) == sizeof(int),
+      "IpcSemaphore must be same size as int");
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset(
+      (void*)semaphores_,
+      (int)IpcSemaphore::kReady,
+      number_of_semaphores * sizeof(IpcSemaphore)));
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
 }
 
-RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data) : is_imported_(true) {
+RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data)
+    : is_imported_(true) {
   const RemoteBufferInfo& imported_buffer = fromBytes<RemoteBufferInfo>(data);
 
   storage_offset_ = imported_buffer.storage_offset_;
@@ -428,78 +440,79 @@ RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data) : is_imported_(tru
   ipc_handle_ = imported_buffer.ipc_handle_;
   semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
 
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
   ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
 
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
+      (void**)&semaphores_,
+      semaphores_ipc_handle_,
+      cudaIpcMemLazyEnablePeerAccess));
 }
 
 RemoteBufferInfo::~RemoteBufferInfo() {
   if (is_imported_) {
-    std::cout << "RANK " << Communicator::getInstance().deviceId() << " closes ipc handle" << std::endl;
     NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_));
     NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_));
   } else {
-    std::cout << "RANK " << Communicator::getInstance().deviceId() << " frees semaphores" << std::endl;
     NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_));
   }
 }
 
-std::ostream& operator<<(std::ostream& os, const cudaIpcMemHandle_t& info) {
-  uint8_t* ptr = (uint8_t*)&info;
-  for (int i = 0; i < (int)sizeof(cudaIpcMemHandle_t); i++) {
-    os << ptr[i];
-  }
-  return os;
-}
-
-
-
 std::ostream& operator<<(std::ostream& os, const RemoteBufferInfo& info) {
   os << "RemoteBufferInfo(ptr=" << info.ptr_
      << ", storage_offset=" << info.storage_offset_
      << ", element_size=" << info.element_size_
      << ", is_imported=" << info.is_imported_
-     << ", semaphores_=" << info.semaphores_
-     << ", ipc_handle_=" << info.ipc_handle_
-     << ", semaphores_ipc_handle_=" << info.semaphores_ipc_handle_
-     << ")";
+     << ", semaphores_=" << info.semaphores_ << ")";
   return os;
 }
 
 void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
   const int64_t my_rank = communicator_->deviceId();
-  auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { 
-    return this->expr_evaluator_.evaluate(communication->buffer()).as<at::Tensor>();
+  auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor {
+    return this->expr_evaluator_.evaluate(communication->buffer())
+        .as<at::Tensor>();
   };
 
   std::vector<P2PCommunication*> communications;
-  for (auto expr: share_mem_handles->communications()) {
+  for (auto expr : share_mem_handles->communications()) {
     auto communication = expr->as<P2PCommunication>();
-    const auto dst = expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
-    const auto src = expr_evaluator_.evaluate(communication->src()).as<int64_t>();
+    const auto dst =
+        expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
+    const auto src =
+        expr_evaluator_.evaluate(communication->src()).as<int64_t>();
     const bool is_sender = my_rank == src;
     const bool is_receiver = my_rank == dst;
-    if (!(is_sender ^ is_receiver)) { // REMOVE or adapt exporting/opening the handle
+    if (!(is_sender ^
+          is_receiver)) { // REMOVE or adapt exporting/opening the handle
       return;
     }
-    if (remote_buffers_.find(get_tensor(communication)) != remote_buffers_.end()) {
+    if (remote_buffers_.find(get_tensor(communication)) !=
+        remote_buffers_.end()) {
       continue;
     }
     communications.push_back(communication);
   }
 
   // put memhandles to TCP store
-  auto get_key = [this] (P2PCommunication* communication, int64_t rank) -> std::string { 
-    return "nvfuser_remote_buffer_info_P2PComm_dst=" + std::to_string(this->expr_evaluator_.evaluate(communication->dst()).as<int64_t>()) + "_src=" + std::to_string(this->expr_evaluator_.evaluate(communication->src()).as<int64_t>()) + "_rank=" + std::to_string(rank);
+  auto get_key =
+      [this](P2PCommunication* communication, int64_t rank) -> std::string {
+    return "nvfuser_remote_buffer_info_P2PComm_dst=" +
+        std::to_string(this->expr_evaluator_.evaluate(communication->dst())
+                           .as<int64_t>()) +
+        "_src=" +
+        std::to_string(this->expr_evaluator_.evaluate(communication->src())
+                           .as<int64_t>()) +
+        "_rank=" + std::to_string(rank);
   };
-  std::unordered_map<P2PCommunication*, std::unique_ptr<RemoteBufferInfo>> buffer_handles;
+  std::unordered_map<P2PCommunication*, std::unique_ptr<RemoteBufferInfo>>
+      buffer_handles;
   auto store = communicator_->getTcpStore();
-  for (P2PCommunication* communication: communications) {
-    auto buffer_handle = std::make_unique<RemoteBufferInfo>(get_tensor(communication));
-    std::cout << "RANK " << my_rank << " registers at key " << get_key(communication, my_rank) << std::endl;
+  for (P2PCommunication* communication : communications) {
+    auto buffer_handle =
+        std::make_unique<RemoteBufferInfo>(get_tensor(communication));
     store->set(get_key(communication, my_rank), toBytes(*buffer_handle));
-    std::cout << "RANK "  << my_rank << " creates buffer_handle " << *buffer_handle << std::endl;
     buffer_handles.emplace(communication, std::move(buffer_handle));
   }
 
@@ -507,34 +520,40 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
   // TODO: precisely select what ranks need to wait on that barrier.
   communicator_->barrier();
 
-    // get memhandles to TCP store
-  for (P2PCommunication* communication: communications) {
+  // get memhandles to TCP store
+  for (P2PCommunication* communication : communications) {
     std::vector<std::unique_ptr<RemoteBufferInfo>> remote_buffers;
     remote_buffers.reserve(communicator_->size());
-    const auto dst = expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
-    const auto src = expr_evaluator_.evaluate(communication->src()).as<int64_t>();
+    const auto dst =
+        expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
+    const auto src =
+        expr_evaluator_.evaluate(communication->src()).as<int64_t>();
     for (int64_t rank : c10::irange(communicator_->size())) {
       if (rank != src && rank != dst) {
         remote_buffers.push_back(nullptr);
         continue;
       }
-      std::cout << "RANK " << my_rank << " after barrier for key " << get_key(communication, rank) << std::endl;
       if (rank == my_rank) {
         // opening an ipc handle on the exporter's device is not supported
         remote_buffers.push_back(std::move(buffer_handles.at(communication)));
       } else {
         std::string key = get_key(communication, rank);
-        NVF_ERROR(store->check({key}), "key ", key, " not found in store at rank ", my_rank);
-        auto imported_remote_buffer_info = std::make_unique<RemoteBufferInfo>(store->get(key));
+        NVF_ERROR(
+            store->check({key}),
+            "key ",
+            key,
+            " not found in store at rank ",
+            my_rank);
+        auto imported_remote_buffer_info =
+            std::make_unique<RemoteBufferInfo>(store->get(key));
         remote_buffers.push_back(std::move(imported_remote_buffer_info));
       }
-      std::cout << "RANK "  << my_rank << " emplaces at rank " << rank << " remote buffer " << *remote_buffers.back() << std::endl;
     }
-    remote_buffers_.emplace(get_tensor(communication), std::move(remote_buffers));
+    remote_buffers_.emplace(
+        get_tensor(communication), std::move(remote_buffers));
   }
 }
 
-
 void HostIrEvaluator::handle(Communication* communication) {
   NVF_ERROR(
       communicator_ != nullptr && communicator_->is_available(),
@@ -560,41 +579,6 @@ void HostIrEvaluator::handle(Communication* communication) {
   }
 
   NVF_ERROR(communication->type() == CommunicationType::Allgather);
-
-  // std::vector<at::Tensor> output_tensors =
-  //     at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0);
-  // const std::vector<void*>& input_ptrs = communicator_->getRemotePtrs(input_tensor);
-  // cudaStream_t current_stream =
-  //     c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream();
-  // // TODO: use multicast
-  // for (auto i = 0; i < communicator_->size(); i++) {
-  //   cudaStream_t stream = c10::cuda::getStreamFromPool(
-  //                             /*isHighPriority=*/false, my_local_device_index_)
-  //                             .stream();
-  //   cudaEvent_t event = {};
-  //   NVFUSER_CUDA_RT_SAFE_CALL(
-  //       cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(
-  //       cudaStreamWaitEvent(stream, event, cudaEventWaitDefault));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
-
-  //   auto output = output_tensors.at(i);
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-  //       output.data_ptr(),
-  //       input_ptrs.at(i),
-  //       output.numel() * output.element_size(),
-  //       cudaMemcpyDeviceToDevice,
-  //       stream));
-
-  //   // sync
-  //   NVFUSER_CUDA_RT_SAFE_CALL(
-  //       cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(
-  //       cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault));
-  //   NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event));
-  // }
 }
 
 void HostIrEvaluator::handle(P2PCommunication* communication) {
@@ -615,7 +599,6 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   at::Tensor buffer =
       getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_);
 
-
   if (backend_type != CommunicatorBackend::kCuda) {
     works_[communication] = postSingleCommunication(
         communication,
@@ -628,22 +611,34 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
   }
 
   const auto it = remote_buffers_.find(buffer);
-  NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", communication->toString(), " at rank ", my_rank);
-  const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers = it->second;
+  NVF_ERROR(
+      it != remote_buffers_.end(),
+      "No remote buffer found for ",
+      communication->toString(),
+      " at rank ",
+      my_rank);
+  const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers =
+      it->second;
   const int64_t peer = is_sender ? dst : src;
   const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
   const RemoteBufferInfo& peer_buffer = *remote_buffers.at(peer);
-  const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
-  const auto remote_semaphore = reinterpret_cast<CUdeviceptr>(&peer_buffer.semaphores()[my_rank]);
-  static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
+  const auto local_semaphore =
+      reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
+  const auto remote_semaphore =
+      reinterpret_cast<CUdeviceptr>(&peer_buffer.semaphores()[my_rank]);
+  static_assert(
+      sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
 
-  const auto current_stream = reinterpret_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
+  const auto current_stream = reinterpret_cast<CUstream>(
+      c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
 
   if (is_receiver) {
     // wait for sender to be ready
-    std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl;
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ));
-    std::cout << "RANK " << my_rank << " RECV after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl;
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
+        current_stream,
+        local_semaphore,
+        (cuuint32_t)(IpcSemaphore::kTransferInProgress),
+        CU_STREAM_WAIT_VALUE_EQ));
     // RDMA get the data from the sender
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
         buffer.data_ptr(),
@@ -652,26 +647,32 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
         cudaMemcpyDeviceToDevice,
         current_stream));
     // Signals completion to self
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+        current_stream,
+        local_semaphore,
+        (cuuint32_t)(IpcSemaphore::kReady),
+        CU_STREAM_WRITE_VALUE_DEFAULT));
     // Signals completion to receiver
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT));
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+        current_stream,
+        remote_semaphore,
+        (cuuint32_t)(IpcSemaphore::kReady),
+        CU_STREAM_WRITE_VALUE_DEFAULT));
   } else /*sender*/ {
-    std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
-    std::cout << "RANK " << my_rank << " SEND BEFORE signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
     // signal to self that transfer is in progress
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT));
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+        current_stream,
+        local_semaphore,
+        (cuuint32_t)(IpcSemaphore::kTransferInProgress),
+        CU_STREAM_WRITE_VALUE_DEFAULT));
     // signal to receiver that the buffer is ready
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error
-    std::cout << "RANK " << my_rank << " SEND AFTER signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl;
-    // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-    // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
-    //     peer_buffer.ptr(),
-    //     buffer.data_ptr(),
-    //     // my_buffer.ptr(),
-    //     buffer.numel() * buffer.element_size(),
-    //     cudaMemcpyDeviceToDevice
-    //     // current_stream));
-    //     ));
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+        current_stream,
+        remote_semaphore,
+        (cuuint32_t)(IpcSemaphore::kTransferInProgress),
+        CU_STREAM_WRITE_VALUE_DEFAULT)); // passing
+                                         // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
+                                         // gives an error
   }
 }
 
@@ -691,23 +692,33 @@ void HostIrEvaluator::handle(Wait* wait) {
     return;
   }
 
-
   const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as<int64_t>();
   const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as<int64_t>();
   const int64_t my_rank = communicator_->deviceId();
   if (my_rank == src) {
-    const auto current_stream = static_cast<CUstream>(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
-    at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
+    const auto current_stream = static_cast<CUstream>(
+        c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
+    at::Tensor buffer =
+        getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
     const auto it = remote_buffers_.find(buffer);
-    NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank);
-    const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers = it->second;
+    NVF_ERROR(
+        it != remote_buffers_.end(),
+        "No remote buffer found for ",
+        p2p_comm->toString(),
+        " at rank ",
+        my_rank);
+    const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers =
+        it->second;
 
     const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
-    const auto local_semaphore = reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[dst]);
-
-    std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl;
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ));
-    std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << "recv tensor=" << buffer << std::endl;
+    const auto local_semaphore =
+        reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[dst]);
+
+    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
+        current_stream,
+        local_semaphore,
+        (cuuint32_t)(IpcSemaphore::kReady),
+        CU_STREAM_WAIT_VALUE_EQ));
   }
 }
 
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index f052c1bfeb7..6727fc7622a 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -50,14 +50,10 @@ class HostIrExecutor : public ExecutorAbstract {
 
 namespace hir {
 
-enum class IpcSemaphore : cuuint32_t {
-  kReady,
-  kTransferInProgress
-};
+enum class IpcSemaphore : cuuint32_t { kReady, kTransferInProgress };
 
 class RemoteBufferInfo {
  public:
-
   RemoteBufferInfo(at::Tensor tensor);
   RemoteBufferInfo(std::vector<uint8_t> data); // means it is imported
   ~RemoteBufferInfo();
@@ -189,7 +185,11 @@ class HostIrEvaluator final : public OptOutDispatch {
       return lhs.equal(rhs);
     }
   };
-  std::unordered_map<at::Tensor, std::vector<std::unique_ptr<RemoteBufferInfo>>, TensorHash, TensorEqual>
+  std::unordered_map<
+      at::Tensor,
+      std::vector<std::unique_ptr<RemoteBufferInfo>>,
+      TensorHash,
+      TensorEqual>
       remote_buffers_;
 };
 
diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp
index 5ea51fd82ff..edc9c476eaf 100644
--- a/csrc/host_ir/host_ir.cpp
+++ b/csrc/host_ir/host_ir.cpp
@@ -323,8 +323,10 @@ std::string EndCoalescing::toInlineString(int indent_size) const {
   NVF_CHECK(false, "Cannot be printed inline");
 }
 
-
-ShareMemHandles::ShareMemHandles(IrBuilderPasskey passkey, std::vector<P2PCommunication*> communications) : Expr(passkey) {
+ShareMemHandles::ShareMemHandles(
+    IrBuilderPasskey passkey,
+    std::vector<P2PCommunication*> communications)
+    : Expr(passkey) {
   NVF_ERROR(passkey.ir_container_ != nullptr);
   NVF_ERROR(
       passkey.ir_container_->isA<HostIrContainer>(),
diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h
index 64cdb404a8c..efb23b95d67 100644
--- a/csrc/host_ir/host_ir.h
+++ b/csrc/host_ir/host_ir.h
@@ -318,7 +318,9 @@ class EndCoalescing : public Expr {
 class ShareMemHandles : public Expr {
  public:
   using Expr::Expr;
-  ShareMemHandles(IrBuilderPasskey passkey, std::vector<P2PCommunication*> communications);
+  ShareMemHandles(
+      IrBuilderPasskey passkey,
+      std::vector<P2PCommunication*> communications);
 
   ShareMemHandles(const ShareMemHandles& other) = delete;
   ShareMemHandles& operator=(const ShareMemHandles& other) = delete;
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 53c9fbcead8..65b994aa125 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -11,10 +11,9 @@
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
 // #include <cuda.h>
-#include <driver_api.h>
-#include <cuda_utils.h>
 #include <cuda_runtime.h>
-
+#include <cuda_utils.h>
+#include <driver_api.h>
 
 #include <exceptions.h>
 #include <multidevice/multidevice.h>
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index 74efa49efbe..9db4f3a78eb 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -413,7 +413,7 @@ INSTANTIATE_TEST_SUITE_P(
     testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc),
     testing::PrintToStringParamName());
 
-using P2PCommunicationTest = MultiDeviceTest; 
+using P2PCommunicationTest = MultiDeviceTest;
 
 TEST_F(P2PCommunicationTest, CudaComm) {
   static constexpr int kTensorSize = 8;
@@ -440,10 +440,13 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   container->addInput(send_tv);
   container->addInput(recv_tv);
 
-  auto recv = IrBuilder::create<P2PCommunication>(recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda);
-  auto send = IrBuilder::create<P2PCommunication>(send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda);
+  auto recv = IrBuilder::create<P2PCommunication>(
+      recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda);
+  auto send = IrBuilder::create<P2PCommunication>(
+      send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda);
   std::vector<P2PCommunication*> grouped_communications = {recv, send};
-  auto share_mem_handles = IrBuilder::create<hir::ShareMemHandles>(std::move(grouped_communications));
+  auto share_mem_handles = IrBuilder::create<hir::ShareMemHandles>(
+      std::move(grouped_communications));
   auto wait_recv = IrBuilder::create<hir::Wait>(recv);
   auto wait_send = IrBuilder::create<hir::Wait>(send);
 
@@ -458,17 +461,21 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options);
   at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options);
 
-  std::unordered_map<Val*, c10::IValue> inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}};
+  std::unordered_map<Val*, c10::IValue> inputs = {
+      {send_tv, send_tensor}, {recv_tv, recv_tensor}};
 
   for (auto repetition : c10::irange(kNumRepetitions)) {
-    send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * my_rank);
-    std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl;
+    send_tensor.copy_(
+        at::arange(kTensorSize, tensor_options) + repetition * 10 +
+        100 * my_rank);
 
     executor.runWithInput(inputs);
 
-    std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl;
-    auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * recv_peer;
-    EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref;
+    auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 +
+        100 * recv_peer;
+    EXPECT_TRUE(torch::allclose(recv_tensor, ref))
+        << "Rank " << my_rank << " failed at repetition " << repetition
+        << " with recv tensor " << recv_tensor << " and ref " << ref;
   }
 }
 
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 413df0f06a4..a46fb5c1758 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -5,6 +5,7 @@
 * SPDX-License-Identifier: BSD-3-Clause
 */
 // clang-format on
+#include <cuda.h>
 #include <cuda_profiler_api.h>
 #include <fusion.h>
 #include <host_ir/container.h>
@@ -13,7 +14,6 @@
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
 #include <tests/cpp/multidevice_kernels.h>
-#include <cuda.h>
 
 namespace nvfuser {
 
@@ -148,9 +148,12 @@ TEST_F(StreamOpTest, StreamWriteValue32) {
   NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0));
   NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream));
   NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int)));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream));
-  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(&value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+      buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream));
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+      stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+      &value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream));
   NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream));
   EXPECT_EQ(value, new_value);
 }
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
index c1df5684c47..af7e153ba46 100644
--- a/tests/cpp/test_multidevice_overlap.cpp
+++ b/tests/cpp/test_multidevice_overlap.cpp
@@ -10,9 +10,9 @@
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/ArrayRef.h>
 // #include <cuda.h>
-#include <driver_api.h>
 #include <cuda_profiler_api.h>
 #include <cuda_runtime.h>
+#include <driver_api.h>
 #include <fusion.h>
 #include <host_ir/container.h>
 #include <host_ir/executor.h>

From 326b683932536d067e2b3c6063e90e72e117a560 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 09:39:27 -0800
Subject: [PATCH 47/55] Move distributed tensors to separate file

---
 CMakeLists.txt                           |   1 +
 csrc/host_ir/executor.cpp                | 111 ++++++-----------------
 csrc/host_ir/executor.h                  |  30 +-----
 csrc/multidevice/communicator.h          |  12 ---
 tests/cpp/test_multidevice_gpu_comms.cpp |   1 +
 5 files changed, 31 insertions(+), 124 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e59cfddc65..ee371dc64dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -169,6 +169,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/mma_type.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp
+  ${NVFUSER_SRCS_DIR}/multidevice/distributed_buffer.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/utils.cpp
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index ae4dbb028b8..bb5462d6985 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -8,7 +8,6 @@
 
 #include <ATen/cuda/CUDAContext.h>
 
-#include <cuda.h>
 #include <dynamic_transform.h>
 #include <fusion_profiler.h>
 #include <host_ir/executor.h>
@@ -410,64 +409,6 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
   }
 }
 
-RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor)
-    : ptr_(tensor.data_ptr()),
-      storage_offset_(tensor.storage_offset()),
-      element_size_(tensor.element_size()),
-      is_imported_(false) {
-  NVFUSER_CUDA_RT_SAFE_CALL(
-      cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
-  const auto number_of_semaphores = Communicator::getInstance().size();
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(
-      (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore)));
-  static_assert(
-      sizeof(IpcSemaphore) == sizeof(int),
-      "IpcSemaphore must be same size as int");
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset(
-      (void*)semaphores_,
-      (int)IpcSemaphore::kReady,
-      number_of_semaphores * sizeof(IpcSemaphore)));
-  NVFUSER_CUDA_RT_SAFE_CALL(
-      cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
-}
-
-RemoteBufferInfo::RemoteBufferInfo(std::vector<uint8_t> data)
-    : is_imported_(true) {
-  const RemoteBufferInfo& imported_buffer = fromBytes<RemoteBufferInfo>(data);
-
-  storage_offset_ = imported_buffer.storage_offset_;
-  element_size_ = imported_buffer.element_size_;
-  ipc_handle_ = imported_buffer.ipc_handle_;
-  semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
-
-  NVFUSER_CUDA_RT_SAFE_CALL(
-      cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
-  ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
-      (void**)&semaphores_,
-      semaphores_ipc_handle_,
-      cudaIpcMemLazyEnablePeerAccess));
-}
-
-RemoteBufferInfo::~RemoteBufferInfo() {
-  if (is_imported_) {
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_));
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_));
-  } else {
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_));
-  }
-}
-
-std::ostream& operator<<(std::ostream& os, const RemoteBufferInfo& info) {
-  os << "RemoteBufferInfo(ptr=" << info.ptr_
-     << ", storage_offset=" << info.storage_offset_
-     << ", element_size=" << info.element_size_
-     << ", is_imported=" << info.is_imported_
-     << ", semaphores_=" << info.semaphores_ << ")";
-  return os;
-}
-
 void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
   const int64_t my_rank = communicator_->deviceId();
   auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor {
@@ -488,8 +429,8 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
           is_receiver)) { // REMOVE or adapt exporting/opening the handle
       return;
     }
-    if (remote_buffers_.find(get_tensor(communication)) !=
-        remote_buffers_.end()) {
+    if (distributed_buffers_.find(get_tensor(communication)) !=
+        distributed_buffers_.end()) {
       continue;
     }
     communications.push_back(communication);
@@ -498,7 +439,7 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
   // put memhandles to TCP store
   auto get_key =
       [this](P2PCommunication* communication, int64_t rank) -> std::string {
-    return "nvfuser_remote_buffer_info_P2PComm_dst=" +
+    return "nvfuser_distributed_buffer_info_P2PComm_dst=" +
         std::to_string(this->expr_evaluator_.evaluate(communication->dst())
                            .as<int64_t>()) +
         "_src=" +
@@ -506,12 +447,12 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
                            .as<int64_t>()) +
         "_rank=" + std::to_string(rank);
   };
-  std::unordered_map<P2PCommunication*, std::unique_ptr<RemoteBufferInfo>>
+  std::unordered_map<P2PCommunication*, std::unique_ptr<DistributedBuffer>>
       buffer_handles;
   auto store = communicator_->getTcpStore();
   for (P2PCommunication* communication : communications) {
     auto buffer_handle =
-        std::make_unique<RemoteBufferInfo>(get_tensor(communication));
+        std::make_unique<DistributedBuffer>(get_tensor(communication));
     store->set(get_key(communication, my_rank), toBytes(*buffer_handle));
     buffer_handles.emplace(communication, std::move(buffer_handle));
   }
@@ -522,20 +463,20 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
 
   // get memhandles to TCP store
   for (P2PCommunication* communication : communications) {
-    std::vector<std::unique_ptr<RemoteBufferInfo>> remote_buffers;
-    remote_buffers.reserve(communicator_->size());
+    std::vector<std::unique_ptr<DistributedBuffer>> distributed_buffers;
+    distributed_buffers.reserve(communicator_->size());
     const auto dst =
         expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
     const auto src =
         expr_evaluator_.evaluate(communication->src()).as<int64_t>();
     for (int64_t rank : c10::irange(communicator_->size())) {
       if (rank != src && rank != dst) {
-        remote_buffers.push_back(nullptr);
+        distributed_buffers.push_back(nullptr);
         continue;
       }
       if (rank == my_rank) {
         // opening an ipc handle on the exporter's device is not supported
-        remote_buffers.push_back(std::move(buffer_handles.at(communication)));
+        distributed_buffers.push_back(std::move(buffer_handles.at(communication)));
       } else {
         std::string key = get_key(communication, rank);
         NVF_ERROR(
@@ -544,13 +485,13 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
             key,
             " not found in store at rank ",
             my_rank);
-        auto imported_remote_buffer_info =
-            std::make_unique<RemoteBufferInfo>(store->get(key));
-        remote_buffers.push_back(std::move(imported_remote_buffer_info));
+        auto imported_distributed_buffer_info =
+            std::make_unique<DistributedBuffer>(store->get(key));
+        distributed_buffers.push_back(std::move(imported_distributed_buffer_info));
       }
     }
-    remote_buffers_.emplace(
-        get_tensor(communication), std::move(remote_buffers));
+    distributed_buffers_.emplace(
+        get_tensor(communication), std::move(distributed_buffers));
   }
 }
 
@@ -610,18 +551,18 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     return;
   }
 
-  const auto it = remote_buffers_.find(buffer);
+  const auto it = distributed_buffers_.find(buffer);
   NVF_ERROR(
-      it != remote_buffers_.end(),
+      it != distributed_buffers_.end(),
       "No remote buffer found for ",
       communication->toString(),
       " at rank ",
       my_rank);
-  const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers =
+  const std::vector<std::unique_ptr<DistributedBuffer>>& distributed_buffers =
       it->second;
   const int64_t peer = is_sender ? dst : src;
-  const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
-  const RemoteBufferInfo& peer_buffer = *remote_buffers.at(peer);
+  const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank);
+  const DistributedBuffer& peer_buffer = *distributed_buffers.at(peer);
   const auto local_semaphore =
       reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
   const auto remote_semaphore =
@@ -637,7 +578,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
         current_stream,
         local_semaphore,
-        (cuuint32_t)(IpcSemaphore::kTransferInProgress),
+        (cuuint32_t)(IpcSemaphore::kInUse),
         CU_STREAM_WAIT_VALUE_EQ));
     // RDMA get the data from the sender
     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
@@ -663,13 +604,13 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
         current_stream,
         local_semaphore,
-        (cuuint32_t)(IpcSemaphore::kTransferInProgress),
+        (cuuint32_t)(IpcSemaphore::kInUse),
         CU_STREAM_WRITE_VALUE_DEFAULT));
     // signal to receiver that the buffer is ready
     NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
         current_stream,
         remote_semaphore,
-        (cuuint32_t)(IpcSemaphore::kTransferInProgress),
+        (cuuint32_t)(IpcSemaphore::kInUse),
         CU_STREAM_WRITE_VALUE_DEFAULT)); // passing
                                          // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
                                          // gives an error
@@ -700,17 +641,17 @@ void HostIrEvaluator::handle(Wait* wait) {
         c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
     at::Tensor buffer =
         getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
-    const auto it = remote_buffers_.find(buffer);
+    const auto it = distributed_buffers_.find(buffer);
     NVF_ERROR(
-        it != remote_buffers_.end(),
+        it != distributed_buffers_.end(),
         "No remote buffer found for ",
         p2p_comm->toString(),
         " at rank ",
         my_rank);
-    const std::vector<std::unique_ptr<RemoteBufferInfo>>& remote_buffers =
+    const std::vector<std::unique_ptr<DistributedBuffer>>& distributed_buffers =
         it->second;
 
-    const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank);
+    const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank);
     const auto local_semaphore =
         reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[dst]);
 
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 6727fc7622a..634b27c755e 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -12,6 +12,7 @@
 #include <host_ir/container.h>
 #include <host_ir/host_ir.h>
 #include <multidevice/communicator.h>
+#include <multidevice/distributed_buffer.h>
 #include <runtime/executor.h>
 #include <runtime/executor_abstract.h>
 #include <runtime/executor_params.h>
@@ -50,31 +51,6 @@ class HostIrExecutor : public ExecutorAbstract {
 
 namespace hir {
 
-enum class IpcSemaphore : cuuint32_t { kReady, kTransferInProgress };
-
-class RemoteBufferInfo {
- public:
-  RemoteBufferInfo(at::Tensor tensor);
-  RemoteBufferInfo(std::vector<uint8_t> data); // means it is imported
-  ~RemoteBufferInfo();
-
-  void* ptr() const {
-    return ptr_;
-  }
-
-  auto semaphores() const {
-    return semaphores_;
-  }
-
-  void* ptr_;
-  int64_t storage_offset_;
-  int64_t element_size_;
-  bool is_imported_;
-  cudaIpcMemHandle_t ipc_handle_;
-  cudaIpcMemHandle_t semaphores_ipc_handle_;
-  IpcSemaphore* semaphores_;
-};
-
 /*
 a HostIrEvaluator evaluates a host programs represented through a
 HostIrContainer It is instantiated with the desired HostIrContainer, and runs
@@ -187,10 +163,10 @@ class HostIrEvaluator final : public OptOutDispatch {
   };
   std::unordered_map<
       at::Tensor,
-      std::vector<std::unique_ptr<RemoteBufferInfo>>,
+      std::vector<std::unique_ptr<DistributedBuffer>>,
       TensorHash,
       TensorEqual>
-      remote_buffers_;
+      distributed_buffers_;
 };
 
 } // namespace hir
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 65b994aa125..e8b71df1465 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -28,18 +28,6 @@
 
 namespace nvfuser {
 
-template <typename T>
-std::vector<uint8_t> toBytes(const T& data) {
-  return std::vector<uint8_t>(
-      reinterpret_cast<const uint8_t*>(&data),
-      reinterpret_cast<const uint8_t*>(&data) + sizeof(T));
-}
-
-template <typename T>
-const T& fromBytes(const std::vector<uint8_t>& bytes) {
-  return *reinterpret_cast<const T*>(bytes.data());
-}
-
 // This file implements the class Communicator which sets up the inter-process
 // Backend. This class contains inter-process information, such as the rank, the
 // world size, as well as the Process Group that can be called to perform
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index a46fb5c1758..acddba06547 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -11,6 +11,7 @@
 #include <host_ir/container.h>
 #include <host_ir/executor.h>
 #include <ir/all_nodes.h>
+#include <multidevice/distributed_buffer.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
 #include <tests/cpp/multidevice_kernels.h>

From cf8991c033eebe08186e05fbe9e08f7bafe88fe9 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 09:42:48 -0800
Subject: [PATCH 48/55] rename DistributedBuffer to IpcHandle

---
 CMakeLists.txt                           |  2 +-
 csrc/host_ir/executor.cpp                | 46 ++++++++++++------------
 csrc/host_ir/executor.h                  |  6 ++--
 tests/cpp/test_multidevice_gpu_comms.cpp |  2 +-
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ee371dc64dc..3c862cc5a3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -169,7 +169,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/mma_type.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp
-  ${NVFUSER_SRCS_DIR}/multidevice/distributed_buffer.cpp
+  ${NVFUSER_SRCS_DIR}/multidevice/ipc_handle.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/utils.cpp
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index bb5462d6985..a4e9ec1a701 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -429,8 +429,8 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
           is_receiver)) { // REMOVE or adapt exporting/opening the handle
       return;
     }
-    if (distributed_buffers_.find(get_tensor(communication)) !=
-        distributed_buffers_.end()) {
+    if (ipc_handles_.find(get_tensor(communication)) !=
+        ipc_handles_.end()) {
       continue;
     }
     communications.push_back(communication);
@@ -439,7 +439,7 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
   // put memhandles to TCP store
   auto get_key =
       [this](P2PCommunication* communication, int64_t rank) -> std::string {
-    return "nvfuser_distributed_buffer_info_P2PComm_dst=" +
+    return "nvfuser_ipc_handle_info_P2PComm_dst=" +
         std::to_string(this->expr_evaluator_.evaluate(communication->dst())
                            .as<int64_t>()) +
         "_src=" +
@@ -447,12 +447,12 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
                            .as<int64_t>()) +
         "_rank=" + std::to_string(rank);
   };
-  std::unordered_map<P2PCommunication*, std::unique_ptr<DistributedBuffer>>
+  std::unordered_map<P2PCommunication*, std::unique_ptr<IpcHandle>>
       buffer_handles;
   auto store = communicator_->getTcpStore();
   for (P2PCommunication* communication : communications) {
     auto buffer_handle =
-        std::make_unique<DistributedBuffer>(get_tensor(communication));
+        std::make_unique<IpcHandle>(get_tensor(communication));
     store->set(get_key(communication, my_rank), toBytes(*buffer_handle));
     buffer_handles.emplace(communication, std::move(buffer_handle));
   }
@@ -463,20 +463,20 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
 
   // get memhandles to TCP store
   for (P2PCommunication* communication : communications) {
-    std::vector<std::unique_ptr<DistributedBuffer>> distributed_buffers;
-    distributed_buffers.reserve(communicator_->size());
+    std::vector<std::unique_ptr<IpcHandle>> ipc_handles;
+    ipc_handles.reserve(communicator_->size());
     const auto dst =
         expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
     const auto src =
         expr_evaluator_.evaluate(communication->src()).as<int64_t>();
     for (int64_t rank : c10::irange(communicator_->size())) {
       if (rank != src && rank != dst) {
-        distributed_buffers.push_back(nullptr);
+        ipc_handles.push_back(nullptr);
         continue;
       }
       if (rank == my_rank) {
         // opening an ipc handle on the exporter's device is not supported
-        distributed_buffers.push_back(std::move(buffer_handles.at(communication)));
+        ipc_handles.push_back(std::move(buffer_handles.at(communication)));
       } else {
         std::string key = get_key(communication, rank);
         NVF_ERROR(
@@ -485,13 +485,13 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
             key,
             " not found in store at rank ",
             my_rank);
-        auto imported_distributed_buffer_info =
-            std::make_unique<DistributedBuffer>(store->get(key));
-        distributed_buffers.push_back(std::move(imported_distributed_buffer_info));
+        auto imported_ipc_handle_info =
+            std::make_unique<IpcHandle>(store->get(key));
+        ipc_handles.push_back(std::move(imported_ipc_handle_info));
       }
     }
-    distributed_buffers_.emplace(
-        get_tensor(communication), std::move(distributed_buffers));
+    ipc_handles_.emplace(
+        get_tensor(communication), std::move(ipc_handles));
   }
 }
 
@@ -551,18 +551,18 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     return;
   }
 
-  const auto it = distributed_buffers_.find(buffer);
+  const auto it = ipc_handles_.find(buffer);
   NVF_ERROR(
-      it != distributed_buffers_.end(),
+      it != ipc_handles_.end(),
       "No remote buffer found for ",
       communication->toString(),
       " at rank ",
       my_rank);
-  const std::vector<std::unique_ptr<DistributedBuffer>>& distributed_buffers =
+  const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles =
       it->second;
   const int64_t peer = is_sender ? dst : src;
-  const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank);
-  const DistributedBuffer& peer_buffer = *distributed_buffers.at(peer);
+  const IpcHandle& my_buffer = *ipc_handles.at(my_rank);
+  const IpcHandle& peer_buffer = *ipc_handles.at(peer);
   const auto local_semaphore =
       reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
   const auto remote_semaphore =
@@ -641,17 +641,17 @@ void HostIrEvaluator::handle(Wait* wait) {
         c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
     at::Tensor buffer =
         getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
-    const auto it = distributed_buffers_.find(buffer);
+    const auto it = ipc_handles_.find(buffer);
     NVF_ERROR(
-        it != distributed_buffers_.end(),
+        it != ipc_handles_.end(),
         "No remote buffer found for ",
         p2p_comm->toString(),
         " at rank ",
         my_rank);
-    const std::vector<std::unique_ptr<DistributedBuffer>>& distributed_buffers =
+    const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles =
         it->second;
 
-    const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank);
+    const IpcHandle& my_buffer = *ipc_handles.at(my_rank);
     const auto local_semaphore =
         reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[dst]);
 
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 634b27c755e..2badda7f516 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -12,7 +12,7 @@
 #include <host_ir/container.h>
 #include <host_ir/host_ir.h>
 #include <multidevice/communicator.h>
-#include <multidevice/distributed_buffer.h>
+#include <multidevice/ipc_handle.h>
 #include <runtime/executor.h>
 #include <runtime/executor_abstract.h>
 #include <runtime/executor_params.h>
@@ -163,10 +163,10 @@ class HostIrEvaluator final : public OptOutDispatch {
   };
   std::unordered_map<
       at::Tensor,
-      std::vector<std::unique_ptr<DistributedBuffer>>,
+      std::vector<std::unique_ptr<IpcHandle>>,
       TensorHash,
       TensorEqual>
-      distributed_buffers_;
+      ipc_handles_;
 };
 
 } // namespace hir
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index acddba06547..75a6aeba472 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -11,7 +11,7 @@
 #include <host_ir/container.h>
 #include <host_ir/executor.h>
 #include <ir/all_nodes.h>
-#include <multidevice/distributed_buffer.h>
+#include <multidevice/ipc_handle.h>
 #include <ops/all_ops.h>
 #include <tests/cpp/multidevice.h>
 #include <tests/cpp/multidevice_kernels.h>

From 541fe8020b57477744edd8f78c8084c6c5b10691 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 13:14:46 -0800
Subject: [PATCH 49/55] Working checkpoint. Add the new ipc_handle files that
 were missing from the previous commits

---
 csrc/host_ir/executor.cpp       |  31 +++---
 csrc/host_ir/executor.h         |  21 +----
 csrc/multidevice/ipc_handle.cpp |  63 +++++++++++++
 csrc/multidevice/ipc_handle.h   | 161 ++++++++++++++++++++++++++++++++
 4 files changed, 239 insertions(+), 37 deletions(-)
 create mode 100644 csrc/multidevice/ipc_handle.cpp
 create mode 100644 csrc/multidevice/ipc_handle.h

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index a4e9ec1a701..7cebe36f792 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -429,8 +429,7 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
           is_receiver)) { // REMOVE or adapt exporting/opening the handle
       return;
     }
-    if (ipc_handles_.find(get_tensor(communication)) !=
-        ipc_handles_.end()) {
+    if (ipc_handle_cache_.find(communication, expr_evaluator_) != nullptr) {
       continue;
     }
     communications.push_back(communication);
@@ -463,20 +462,20 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
 
   // get memhandles to TCP store
   for (P2PCommunication* communication : communications) {
-    std::vector<std::unique_ptr<IpcHandle>> ipc_handles;
-    ipc_handles.reserve(communicator_->size());
+    auto ipc_handles = std::make_unique<std::vector<std::unique_ptr<IpcHandle>>>();
+    ipc_handles->reserve(communicator_->size());
     const auto dst =
         expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
     const auto src =
         expr_evaluator_.evaluate(communication->src()).as<int64_t>();
     for (int64_t rank : c10::irange(communicator_->size())) {
       if (rank != src && rank != dst) {
-        ipc_handles.push_back(nullptr);
+        ipc_handles->push_back(nullptr);
         continue;
       }
       if (rank == my_rank) {
         // opening an ipc handle on the exporter's device is not supported
-        ipc_handles.push_back(std::move(buffer_handles.at(communication)));
+        ipc_handles->push_back(std::move(buffer_handles.at(communication)));
       } else {
         std::string key = get_key(communication, rank);
         NVF_ERROR(
@@ -487,11 +486,11 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
             my_rank);
         auto imported_ipc_handle_info =
             std::make_unique<IpcHandle>(store->get(key));
-        ipc_handles.push_back(std::move(imported_ipc_handle_info));
+        ipc_handles->push_back(std::move(imported_ipc_handle_info));
       }
     }
-    ipc_handles_.emplace(
-        get_tensor(communication), std::move(ipc_handles));
+    ipc_handle_cache_.insert(
+      communication, expr_evaluator_, std::move(ipc_handles));
   }
 }
 
@@ -551,15 +550,14 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     return;
   }
 
-  const auto it = ipc_handles_.find(buffer);
+  const auto it = ipc_handle_cache_.find(communication, expr_evaluator_);
   NVF_ERROR(
-      it != ipc_handles_.end(),
+      it != nullptr,
       "No remote buffer found for ",
       communication->toString(),
       " at rank ",
       my_rank);
-  const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles =
-      it->second;
+  const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles = *it;
   const int64_t peer = is_sender ? dst : src;
   const IpcHandle& my_buffer = *ipc_handles.at(my_rank);
   const IpcHandle& peer_buffer = *ipc_handles.at(peer);
@@ -641,15 +639,14 @@ void HostIrEvaluator::handle(Wait* wait) {
         c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
     at::Tensor buffer =
         getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
-    const auto it = ipc_handles_.find(buffer);
+    const auto it = ipc_handle_cache_.find(p2p_comm, expr_evaluator_);
     NVF_ERROR(
-        it != ipc_handles_.end(),
+        it != nullptr,
         "No remote buffer found for ",
         p2p_comm->toString(),
         " at rank ",
         my_rank);
-    const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles =
-        it->second;
+    const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles = *it;
 
     const IpcHandle& my_buffer = *ipc_handles.at(my_rank);
     const auto local_semaphore =
diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h
index 2badda7f516..baac74b6756 100644
--- a/csrc/host_ir/executor.h
+++ b/csrc/host_ir/executor.h
@@ -147,26 +147,7 @@ class HostIrEvaluator final : public OptOutDispatch {
   std::unordered_map<StreamKey, c10::cuda::CUDAStream> streams_;
   std::unordered_map<Expr*, c10::intrusive_ptr<c10d::Work>> works_;
   const int64_t my_local_device_index_;
-  struct TensorHash {
-    std::size_t operator()(const at::Tensor& tensor) const {
-      auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
-      auto offset = tensor.storage_offset();
-      auto element_size = tensor.element_size();
-      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^
-          std::hash<int>()(element_size);
-    }
-  };
-  struct TensorEqual {
-    bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const {
-      return lhs.equal(rhs);
-    }
-  };
-  std::unordered_map<
-      at::Tensor,
-      std::vector<std::unique_ptr<IpcHandle>>,
-      TensorHash,
-      TensorEqual>
-      ipc_handles_;
+  IpcHandleCache ipc_handle_cache_;
 };
 
 } // namespace hir
diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp
new file mode 100644
index 00000000000..ccaf9bf5c4d
--- /dev/null
+++ b/csrc/multidevice/ipc_handle.cpp
@@ -0,0 +1,63 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include<cuda_utils.h>
+#include<multidevice/communicator.h>
+#include<multidevice/ipc_handle.h>
+
+namespace nvfuser {
+
+IpcHandle::IpcHandle(at::Tensor tensor)
+    : ptr_(tensor.data_ptr()),
+      storage_offset_(tensor.storage_offset()),
+      element_size_(tensor.element_size()),
+      is_imported_(false) {
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
+  const auto number_of_semaphores = Communicator::getInstance().size();
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(
+      (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore)));
+  static_assert(
+      sizeof(IpcSemaphore) == sizeof(int),
+      "IpcSemaphore must be same size as int");
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset(
+      (void*)semaphores_,
+      (int)IpcSemaphore::kReady,
+      number_of_semaphores * sizeof(IpcSemaphore)));
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
+}
+
+IpcHandle::IpcHandle(std::vector<uint8_t> data)
+    : is_imported_(true) {
+  const IpcHandle& imported_buffer = fromBytes<IpcHandle>(data);
+
+  storage_offset_ = imported_buffer.storage_offset_;
+  element_size_ = imported_buffer.element_size_;
+  ipc_handle_ = imported_buffer.ipc_handle_;
+  semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
+
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
+  ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
+
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
+      (void**)&semaphores_,
+      semaphores_ipc_handle_,
+      cudaIpcMemLazyEnablePeerAccess));
+}
+
+IpcHandle::~IpcHandle() {
+  if (is_imported_) {
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_));
+  } else {
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_));
+  }
+}
+
+} // nvfuser
diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h
new file mode 100644
index 00000000000..5ab790f7e8f
--- /dev/null
+++ b/csrc/multidevice/ipc_handle.h
@@ -0,0 +1,161 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <cuda.h>
+#include <expr_evaluator.h>
+#include <ATen/core/TensorBody.h>
+
+namespace nvfuser {
+
+template <typename T>
+std::vector<uint8_t> toBytes(const T& data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<const uint8_t*>(&data),
+      reinterpret_cast<const uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+const T& fromBytes(const std::vector<uint8_t>& bytes) {
+  return *reinterpret_cast<const T*>(bytes.data());
+}
+
+enum class IpcSemaphore : cuuint32_t { kReady, kInUse };
+
+class IpcHandle {
+ public:
+  IpcHandle(at::Tensor tensor);
+  IpcHandle(std::vector<uint8_t> data); // means it is imported
+  ~IpcHandle();
+
+  void* ptr() const {
+    return ptr_;
+  }
+
+  auto semaphores() const {
+    return semaphores_;
+  }
+
+ private:
+  void* ptr_;
+  int64_t storage_offset_;
+  int64_t element_size_;
+  bool is_imported_;
+  cudaIpcMemHandle_t ipc_handle_;
+  cudaIpcMemHandle_t semaphores_ipc_handle_;
+  IpcSemaphore* semaphores_;
+};
+
+using P2pIpcHandle = std::vector<std::unique_ptr<IpcHandle>>;
+// class P2pIpcHandle {
+//  public:
+//   P2pIpcHandle(IpcHandle local_handle, IpcHandle peer_handle)
+//       : local_handle_(local_handle), peer_handle_(peer_handle) {};
+
+//   ~P2pIpcHandle();
+
+//   const auto& local() {
+//     return local_handle_;
+//   }
+
+//   const auto& peer() {
+//     return peer_handle_;
+//   }
+
+//  private:
+//   IpcHandle local_handle_;
+//   IpcHandle peer_handle_;
+// };
+
+
+class IpcHandleCache {
+  public:
+    IpcHandleCache() = default;
+    ~IpcHandleCache() = default;
+
+
+  P2pIpcHandle* find(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) {
+    return find(getKey(comm, expr_evaluator));
+  }
+
+  void insert(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator, std::unique_ptr<P2pIpcHandle> handle) {
+    handles_[getKey(comm, expr_evaluator)] = std::move(handle);
+  }
+
+ private:
+  using KeyType = std::tuple<int64_t, int64_t, at::Tensor, P2PCommunication*>;
+
+  KeyType getKey(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) {
+    int64_t dst = expr_evaluator.evaluate(comm->dst()).as<int64_t>();
+    int64_t src = expr_evaluator.evaluate(comm->src()).as<int64_t>();
+    at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as<at::Tensor>();
+    return std::make_tuple(dst, src, buffer, comm);
+  }
+
+  void insert(KeyType key, std::unique_ptr<P2pIpcHandle> handle) {
+    handles_[key] = std::move(handle);
+  }
+
+  P2pIpcHandle* find(KeyType key) {
+    auto it = handles_.find(key);
+    if (it == handles_.end()) {
+      return nullptr;
+    }
+    return it->second.get();
+  }
+
+  struct TensorHash {
+    std::size_t operator()(const at::Tensor& tensor) const {
+      auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
+      auto offset = tensor.storage_offset();
+      auto element_size = tensor.element_size();
+      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^
+          std::hash<int>()(element_size);
+    }
+  };
+
+  struct TensorEqual {
+    bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const {
+      return lhs.equal(rhs);
+    }
+  };
+
+
+
+  struct KeyHash {
+    std::size_t operator()(const KeyType& key) const {
+      return (std::hash<int64_t>()(std::get<0>(key)) << 13) ^
+         (std::hash<int64_t>()(std::get<1>(key)) << 7) ^
+         (TensorHash{}(std::get<2>(key))) ^
+         (std::hash<P2PCommunication*>()(std::get<3>(key)));
+    }
+  };
+
+  struct KeyEqual {
+    bool operator()(const KeyType& lhs, const KeyType& rhs) const {
+      return std::get<0>(lhs) == std::get<0>(rhs) &&
+             std::get<1>(lhs) == std::get<1>(rhs) &&
+             TensorEqual{}(std::get<2>(lhs), std::get<2>(rhs)) &&
+             std::get<3>(lhs) == std::get<3>(rhs);
+    }
+  };
+
+  std::unordered_map<
+    KeyType,
+    std::unique_ptr<P2pIpcHandle>,
+    KeyHash,
+    KeyEqual>
+    handles_;
+};
+
+
+// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*)
+// we need a counter on Tensor+P2PCommunication* for each given dst, src
+// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID)
+// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle.
+
+} // nvfuser

From a4960044cb1ee41c5eacc733d17c11761791df27 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 14:39:41 -0800
Subject: [PATCH 50/55] refactor

---
 csrc/host_ir/executor.cpp                | 115 ++-------------------
 csrc/multidevice/ipc_handle.cpp          | 124 ++++++++++++++++++++---
 csrc/multidevice/ipc_handle.h            |  85 +++++++---------
 tests/cpp/test_multidevice_gpu_comms.cpp |  12 +++
 4 files changed, 165 insertions(+), 171 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 7cebe36f792..4d0bbc34776 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -410,88 +410,7 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
 }
 
 void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
-  const int64_t my_rank = communicator_->deviceId();
-  auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor {
-    return this->expr_evaluator_.evaluate(communication->buffer())
-        .as<at::Tensor>();
-  };
-
-  std::vector<P2PCommunication*> communications;
-  for (auto expr : share_mem_handles->communications()) {
-    auto communication = expr->as<P2PCommunication>();
-    const auto dst =
-        expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
-    const auto src =
-        expr_evaluator_.evaluate(communication->src()).as<int64_t>();
-    const bool is_sender = my_rank == src;
-    const bool is_receiver = my_rank == dst;
-    if (!(is_sender ^
-          is_receiver)) { // REMOVE or adapt exporting/opening the handle
-      return;
-    }
-    if (ipc_handle_cache_.find(communication, expr_evaluator_) != nullptr) {
-      continue;
-    }
-    communications.push_back(communication);
-  }
-
-  // put memhandles to TCP store
-  auto get_key =
-      [this](P2PCommunication* communication, int64_t rank) -> std::string {
-    return "nvfuser_ipc_handle_info_P2PComm_dst=" +
-        std::to_string(this->expr_evaluator_.evaluate(communication->dst())
-                           .as<int64_t>()) +
-        "_src=" +
-        std::to_string(this->expr_evaluator_.evaluate(communication->src())
-                           .as<int64_t>()) +
-        "_rank=" + std::to_string(rank);
-  };
-  std::unordered_map<P2PCommunication*, std::unique_ptr<IpcHandle>>
-      buffer_handles;
-  auto store = communicator_->getTcpStore();
-  for (P2PCommunication* communication : communications) {
-    auto buffer_handle =
-        std::make_unique<IpcHandle>(get_tensor(communication));
-    store->set(get_key(communication, my_rank), toBytes(*buffer_handle));
-    buffer_handles.emplace(communication, std::move(buffer_handle));
-  }
-
-  // barrier to ensure all ranks have pushed their memhandles to the store
-  // TODO: precisely select what ranks need to wait on that barrier.
-  communicator_->barrier();
-
-  // get memhandles to TCP store
-  for (P2PCommunication* communication : communications) {
-    auto ipc_handles = std::make_unique<std::vector<std::unique_ptr<IpcHandle>>>();
-    ipc_handles->reserve(communicator_->size());
-    const auto dst =
-        expr_evaluator_.evaluate(communication->dst()).as<int64_t>();
-    const auto src =
-        expr_evaluator_.evaluate(communication->src()).as<int64_t>();
-    for (int64_t rank : c10::irange(communicator_->size())) {
-      if (rank != src && rank != dst) {
-        ipc_handles->push_back(nullptr);
-        continue;
-      }
-      if (rank == my_rank) {
-        // opening an ipc handle on the exporter's device is not supported
-        ipc_handles->push_back(std::move(buffer_handles.at(communication)));
-      } else {
-        std::string key = get_key(communication, rank);
-        NVF_ERROR(
-            store->check({key}),
-            "key ",
-            key,
-            " not found in store at rank ",
-            my_rank);
-        auto imported_ipc_handle_info =
-            std::make_unique<IpcHandle>(store->get(key));
-        ipc_handles->push_back(std::move(imported_ipc_handle_info));
-      }
-    }
-    ipc_handle_cache_.insert(
-      communication, expr_evaluator_, std::move(ipc_handles));
-  }
+  ipc_handle_cache_.exchangeHandles(share_mem_handles->communications(), expr_evaluator_);
 }
 
 void HostIrEvaluator::handle(Communication* communication) {
@@ -550,21 +469,12 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     return;
   }
 
-  const auto it = ipc_handle_cache_.find(communication, expr_evaluator_);
-  NVF_ERROR(
-      it != nullptr,
-      "No remote buffer found for ",
-      communication->toString(),
-      " at rank ",
-      my_rank);
-  const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles = *it;
-  const int64_t peer = is_sender ? dst : src;
-  const IpcHandle& my_buffer = *ipc_handles.at(my_rank);
-  const IpcHandle& peer_buffer = *ipc_handles.at(peer);
+  const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(communication, expr_evaluator_);
+  const IpcHandle& peer_buffer = ipc_handles.peer();
   const auto local_semaphore =
-      reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[peer]);
+      reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore());
   const auto remote_semaphore =
-      reinterpret_cast<CUdeviceptr>(&peer_buffer.semaphores()[my_rank]);
+      reinterpret_cast<CUdeviceptr>(ipc_handles.peer().semaphore());
   static_assert(
       sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
 
@@ -631,7 +541,6 @@ void HostIrEvaluator::handle(Wait* wait) {
     return;
   }
 
-  const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as<int64_t>();
   const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as<int64_t>();
   const int64_t my_rank = communicator_->deviceId();
   if (my_rank == src) {
@@ -639,18 +548,10 @@ void HostIrEvaluator::handle(Wait* wait) {
         c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
     at::Tensor buffer =
         getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
-    const auto it = ipc_handle_cache_.find(p2p_comm, expr_evaluator_);
-    NVF_ERROR(
-        it != nullptr,
-        "No remote buffer found for ",
-        p2p_comm->toString(),
-        " at rank ",
-        my_rank);
-    const std::vector<std::unique_ptr<IpcHandle>>& ipc_handles = *it;
-
-    const IpcHandle& my_buffer = *ipc_handles.at(my_rank);
+
+    const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm, expr_evaluator_);
     const auto local_semaphore =
-        reinterpret_cast<CUdeviceptr>(&my_buffer.semaphores()[dst]);
+        reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore());
 
     NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
         current_stream,
diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp
index ccaf9bf5c4d..9e3c893f223 100644
--- a/csrc/multidevice/ipc_handle.cpp
+++ b/csrc/multidevice/ipc_handle.cpp
@@ -11,52 +11,146 @@
 
 namespace nvfuser {
 
+namespace {
+
+template <typename T>
+std::vector<uint8_t> toBytes(const T& data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<const uint8_t*>(&data),
+      reinterpret_cast<const uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+const T& fromBytes(const std::vector<uint8_t>& bytes) {
+  return *reinterpret_cast<const T*>(bytes.data());
+}
+
+} // namespace
+
+
 IpcHandle::IpcHandle(at::Tensor tensor)
     : ptr_(tensor.data_ptr()),
       storage_offset_(tensor.storage_offset()),
       element_size_(tensor.element_size()),
-      is_imported_(false) {
+      rank_(Communicator::getInstance().deviceId()) {
   NVFUSER_CUDA_RT_SAFE_CALL(
       cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
-  const auto number_of_semaphores = Communicator::getInstance().size();
   NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(
-      (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore)));
+      (void**)&semaphore_, sizeof(IpcSemaphore)));
   static_assert(
       sizeof(IpcSemaphore) == sizeof(int),
       "IpcSemaphore must be same size as int");
   NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset(
-      (void*)semaphores_,
+      (void*)semaphore_,
       (int)IpcSemaphore::kReady,
-      number_of_semaphores * sizeof(IpcSemaphore)));
+      sizeof(IpcSemaphore)));
   NVFUSER_CUDA_RT_SAFE_CALL(
-      cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_));
+      cudaIpcGetMemHandle(&semaphore_ipc_handle_, semaphore_));
 }
 
-IpcHandle::IpcHandle(std::vector<uint8_t> data)
-    : is_imported_(true) {
+IpcHandle::IpcHandle(std::vector<uint8_t> data) {
   const IpcHandle& imported_buffer = fromBytes<IpcHandle>(data);
 
   storage_offset_ = imported_buffer.storage_offset_;
   element_size_ = imported_buffer.element_size_;
   ipc_handle_ = imported_buffer.ipc_handle_;
-  semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_;
+  semaphore_ipc_handle_ = imported_buffer.semaphore_ipc_handle_;
+  rank_ = imported_buffer.rank_;
 
   NVFUSER_CUDA_RT_SAFE_CALL(
       cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess));
   ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_);
 
   NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(
-      (void**)&semaphores_,
-      semaphores_ipc_handle_,
+      (void**)&semaphore_,
+      semaphore_ipc_handle_,
       cudaIpcMemLazyEnablePeerAccess));
 }
 
 IpcHandle::~IpcHandle() {
-  if (is_imported_) {
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_));
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_));
+  if (rank_ == Communicator::getInstance().deviceId()) {
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphore_));
   } else {
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_));
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphore_));
+  }
+}
+
+
+void IpcHandleCache::exchangeHandles(const std::vector<P2PCommunication*>& communications, const ExpressionEvaluator& expr_evaluator) {
+  Communicator* communicator = &Communicator::getInstance();
+  const int64_t my_rank = communicator->deviceId();
+  auto get_tensor = [&expr_evaluator](P2PCommunication* communication) -> at::Tensor {
+    return expr_evaluator.evaluate(communication->buffer())
+        .as<at::Tensor>();
+  };
+
+  std::vector<P2PCommunication*> non_cached_communications;
+  for (auto communication : communications) {
+    const auto dst =
+        expr_evaluator.evaluate(communication->dst()).as<int64_t>();
+    const auto src =
+        expr_evaluator.evaluate(communication->src()).as<int64_t>();
+    const bool is_sender = my_rank == src;
+    const bool is_receiver = my_rank == dst;
+    NVF_ERROR(is_sender || is_receiver, "RANK ", my_rank, " is not involved in the p2p comm ", communication);
+    if (is_sender && is_receiver) {
+      continue;
+    }
+    if (find(communication, expr_evaluator) != nullptr) {
+      continue;
+    }
+    non_cached_communications.push_back(communication);
+  }
+
+  // put memhandles to TCP store
+  auto get_tcp_store_key =
+      [&expr_evaluator](P2PCommunication* communication, int64_t rank) -> std::string {
+    return "nvfuser_ipc_handle_info_P2PComm_dst=" +
+        std::to_string(expr_evaluator.evaluate(communication->dst())
+                          .as<int64_t>()) +
+        "_src=" +
+        std::to_string(expr_evaluator.evaluate(communication->src())
+                          .as<int64_t>()) +
+        "_rank=" + std::to_string(rank);
+  };
+  std::unordered_map<P2PCommunication*, std::unique_ptr<IpcHandle>>
+      local_ipc_handles;
+  auto store = communicator->getTcpStore();
+  for (P2PCommunication* communication : non_cached_communications) {
+    auto buffer_handle =
+        std::make_unique<IpcHandle>(get_tensor(communication));
+    store->set(get_tcp_store_key(communication, my_rank), toBytes(*buffer_handle));
+    local_ipc_handles.emplace(communication, std::move(buffer_handle));
+  }
+
+  // barrier to ensure all ranks have pushed their memhandles to the store
+  // TODO: precisely select what ranks need to wait on that barrier.
+  communicator->barrier();
+
+  // get memhandles from TCP store
+  for (P2PCommunication* communication : non_cached_communications) {
+    const auto dst =
+    expr_evaluator.evaluate(communication->dst()).as<int64_t>();
+    const auto src =
+    expr_evaluator.evaluate(communication->src()).as<int64_t>();
+    int64_t peer = (my_rank == dst) ? src : dst;
+
+    auto& local_ipc_handle = local_ipc_handles.at(communication);
+
+    std::string key = get_tcp_store_key(communication, peer);
+    NVF_ERROR(
+      store->check({key}),
+      "key ",
+      key,
+      " not found in store at rank ",
+      my_rank);
+    auto peer_ipc_handle = std::make_unique<IpcHandle>(store->get(key));
+
+    auto ipc_handles = std::make_unique<P2pIpcHandle>(std::move(local_ipc_handle), std::move(peer_ipc_handle));
+
+    insert(
+      communication, expr_evaluator, std::move(ipc_handles));
   }
 }
 
diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h
index 5ab790f7e8f..fb8c025261d 100644
--- a/csrc/multidevice/ipc_handle.h
+++ b/csrc/multidevice/ipc_handle.h
@@ -12,18 +12,6 @@
 
 namespace nvfuser {
 
-template <typename T>
-std::vector<uint8_t> toBytes(const T& data) {
-  return std::vector<uint8_t>(
-      reinterpret_cast<const uint8_t*>(&data),
-      reinterpret_cast<const uint8_t*>(&data) + sizeof(T));
-}
-
-template <typename T>
-const T& fromBytes(const std::vector<uint8_t>& bytes) {
-  return *reinterpret_cast<const T*>(bytes.data());
-}
-
 enum class IpcSemaphore : cuuint32_t { kReady, kInUse };
 
 class IpcHandle {
@@ -36,68 +24,69 @@ class IpcHandle {
     return ptr_;
   }
 
-  auto semaphores() const {
-    return semaphores_;
+  auto semaphore() const {
+    return semaphore_;
   }
 
  private:
   void* ptr_;
   int64_t storage_offset_;
   int64_t element_size_;
-  bool is_imported_;
   cudaIpcMemHandle_t ipc_handle_;
-  cudaIpcMemHandle_t semaphores_ipc_handle_;
-  IpcSemaphore* semaphores_;
+  cudaIpcMemHandle_t semaphore_ipc_handle_;
+  IpcSemaphore* semaphore_;
+  int64_t rank_;
 };
 
-using P2pIpcHandle = std::vector<std::unique_ptr<IpcHandle>>;
-// class P2pIpcHandle {
-//  public:
-//   P2pIpcHandle(IpcHandle local_handle, IpcHandle peer_handle)
-//       : local_handle_(local_handle), peer_handle_(peer_handle) {};
-
-//   ~P2pIpcHandle();
+class P2pIpcHandle {
+ public:
 
-//   const auto& local() {
-//     return local_handle_;
-//   }
+  P2pIpcHandle(std::unique_ptr<IpcHandle> local, std::unique_ptr<IpcHandle> peer) : local_(std::move(local)), peer_(std::move(peer)) {}
 
-//   const auto& peer() {
-//     return peer_handle_;
-//   }
+  const auto& local() const {
+    return *local_;
+  }
 
-//  private:
-//   IpcHandle local_handle_;
-//   IpcHandle peer_handle_;
-// };
+  const auto& peer() const {
+    return *peer_;
+  }
 
+ private:
+  std::unique_ptr<IpcHandle> local_;
+  std::unique_ptr<IpcHandle> peer_;
+};
 
 class IpcHandleCache {
-  public:
-    IpcHandleCache() = default;
-    ~IpcHandleCache() = default;
-
-
-  P2pIpcHandle* find(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) {
-    return find(getKey(comm, expr_evaluator));
+ public:
+  IpcHandleCache() = default;
+  ~IpcHandleCache() = default;
+
+  const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) {
+    auto it = find(getKey(communication, expr_evaluator));
+    NVF_ERROR(
+      it != nullptr,
+      "No remote buffer found for ",
+      communication->toString());
+    return *it;
   }
 
-  void insert(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator, std::unique_ptr<P2pIpcHandle> handle) {
-    handles_[getKey(comm, expr_evaluator)] = std::move(handle);
-  }
+  void exchangeHandles(const std::vector<P2PCommunication*>& communications, const ExpressionEvaluator& expr_evaluator);
 
  private:
   using KeyType = std::tuple<int64_t, int64_t, at::Tensor, P2PCommunication*>;
 
-  KeyType getKey(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) {
+  KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) {
     int64_t dst = expr_evaluator.evaluate(comm->dst()).as<int64_t>();
     int64_t src = expr_evaluator.evaluate(comm->src()).as<int64_t>();
     at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as<at::Tensor>();
     return std::make_tuple(dst, src, buffer, comm);
   }
+  void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr<P2pIpcHandle> handle) {
+    handles_[getKey(comm, expr_evaluator)] = std::move(handle);
+  }
 
-  void insert(KeyType key, std::unique_ptr<P2pIpcHandle> handle) {
-    handles_[key] = std::move(handle);
+  P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) {
+    return find(getKey(comm, expr_evaluator));
   }
 
   P2pIpcHandle* find(KeyType key) {
@@ -124,8 +113,6 @@ class IpcHandleCache {
     }
   };
 
-
-
   struct KeyHash {
     std::size_t operator()(const KeyType& key) const {
       return (std::hash<int64_t>()(std::get<0>(key)) << 13) ^
diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp
index 75a6aeba472..c9159d3f20c 100644
--- a/tests/cpp/test_multidevice_gpu_comms.cpp
+++ b/tests/cpp/test_multidevice_gpu_comms.cpp
@@ -20,6 +20,18 @@ namespace nvfuser {
 
 #define CUDA_CALL(call) ASSERT_EQ((call), cudaSuccess)
 
+template <typename T>
+std::vector<uint8_t> toBytes(const T& data) {
+  return std::vector<uint8_t>(
+      reinterpret_cast<const uint8_t*>(&data),
+      reinterpret_cast<const uint8_t*>(&data) + sizeof(T));
+}
+
+template <typename T>
+const T& fromBytes(const std::vector<uint8_t>& bytes) {
+  return *reinterpret_cast<const T*>(bytes.data());
+}
+
 class GpuCommTest : public MultiDeviceTest {};
 
 TEST_F(GpuCommTest, IpcMemHandle) {

From 263d95c046714f32f72075d090f49dff8d641005 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 14:54:40 -0800
Subject: [PATCH 51/55] minor cleanup

---
 csrc/multidevice/ipc_handle.cpp |  1 -
 csrc/multidevice/ipc_handle.h   | 29 ++++++++++++-----------------
 2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp
index 9e3c893f223..3cb36f1963d 100644
--- a/csrc/multidevice/ipc_handle.cpp
+++ b/csrc/multidevice/ipc_handle.cpp
@@ -76,7 +76,6 @@ IpcHandle::~IpcHandle() {
   }
 }
 
-
 void IpcHandleCache::exchangeHandles(const std::vector<P2PCommunication*>& communications, const ExpressionEvaluator& expr_evaluator) {
   Communicator* communicator = &Communicator::getInstance();
   const int64_t my_rank = communicator->deviceId();
diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h
index fb8c025261d..f81d74550ae 100644
--- a/csrc/multidevice/ipc_handle.h
+++ b/csrc/multidevice/ipc_handle.h
@@ -56,13 +56,17 @@ class P2pIpcHandle {
   std::unique_ptr<IpcHandle> peer_;
 };
 
+// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*)
+// we need a counter on Tensor+P2PCommunication* for each given dst, src
+// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID)
+// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle.
 class IpcHandleCache {
  public:
   IpcHandleCache() = default;
   ~IpcHandleCache() = default;
 
-  const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) {
-    auto it = find(getKey(communication, expr_evaluator));
+  const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) const {
+    auto it = find(communication, expr_evaluator);
     NVF_ERROR(
       it != nullptr,
       "No remote buffer found for ",
@@ -75,22 +79,19 @@ class IpcHandleCache {
  private:
   using KeyType = std::tuple<int64_t, int64_t, at::Tensor, P2PCommunication*>;
 
-  KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) {
+  KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const  {
     int64_t dst = expr_evaluator.evaluate(comm->dst()).as<int64_t>();
     int64_t src = expr_evaluator.evaluate(comm->src()).as<int64_t>();
     at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as<at::Tensor>();
     return std::make_tuple(dst, src, buffer, comm);
   }
-  void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr<P2pIpcHandle> handle) {
-    handles_[getKey(comm, expr_evaluator)] = std::move(handle);
-  }
 
-  P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) {
-    return find(getKey(comm, expr_evaluator));
+  void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr<P2pIpcHandle> handle)  {
+    handles_[getKey(comm, expr_evaluator)] = std::move(handle);
   }
 
-  P2pIpcHandle* find(KeyType key) {
-    auto it = handles_.find(key);
+  P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const  {
+    auto it = handles_.find(getKey(comm, expr_evaluator));
     if (it == handles_.end()) {
       return nullptr;
     }
@@ -102,7 +103,7 @@ class IpcHandleCache {
       auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
       auto offset = tensor.storage_offset();
       auto element_size = tensor.element_size();
-      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) ^
+      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) << 32 ^
           std::hash<int>()(element_size);
     }
   };
@@ -139,10 +140,4 @@ class IpcHandleCache {
     handles_;
 };
 
-
-// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*)
-// we need a counter on Tensor+P2PCommunication* for each given dst, src
-// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID)
-// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle.
-
 } // nvfuser

From 106d29579e0466062a6b2c4c47ecce4249c61406 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 14:55:22 -0800
Subject: [PATCH 52/55] lint

---
 csrc/host_ir/executor.cpp       |  9 ++--
 csrc/multidevice/ipc_handle.cpp | 73 +++++++++++++++++---------------
 csrc/multidevice/ipc_handle.h   | 75 +++++++++++++++++++--------------
 3 files changed, 89 insertions(+), 68 deletions(-)

diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index 4d0bbc34776..f8ef8572dfe 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -410,7 +410,8 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) {
 }
 
 void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) {
-  ipc_handle_cache_.exchangeHandles(share_mem_handles->communications(), expr_evaluator_);
+  ipc_handle_cache_.exchangeHandles(
+      share_mem_handles->communications(), expr_evaluator_);
 }
 
 void HostIrEvaluator::handle(Communication* communication) {
@@ -469,7 +470,8 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
     return;
   }
 
-  const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(communication, expr_evaluator_);
+  const P2pIpcHandle& ipc_handles =
+      ipc_handle_cache_.get(communication, expr_evaluator_);
   const IpcHandle& peer_buffer = ipc_handles.peer();
   const auto local_semaphore =
       reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore());
@@ -549,7 +551,8 @@ void HostIrEvaluator::handle(Wait* wait) {
     at::Tensor buffer =
         getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
 
-    const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm, expr_evaluator_);
+    const P2pIpcHandle& ipc_handles =
+        ipc_handle_cache_.get(p2p_comm, expr_evaluator_);
     const auto local_semaphore =
         reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore());
 
diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp
index 3cb36f1963d..089071f22d7 100644
--- a/csrc/multidevice/ipc_handle.cpp
+++ b/csrc/multidevice/ipc_handle.cpp
@@ -5,9 +5,9 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
-#include<cuda_utils.h>
-#include<multidevice/communicator.h>
-#include<multidevice/ipc_handle.h>
+#include <cuda_utils.h>
+#include <multidevice/communicator.h>
+#include <multidevice/ipc_handle.h>
 
 namespace nvfuser {
 
@@ -27,7 +27,6 @@ const T& fromBytes(const std::vector<uint8_t>& bytes) {
 
 } // namespace
 
-
 IpcHandle::IpcHandle(at::Tensor tensor)
     : ptr_(tensor.data_ptr()),
       storage_offset_(tensor.storage_offset()),
@@ -35,15 +34,13 @@ IpcHandle::IpcHandle(at::Tensor tensor)
       rank_(Communicator::getInstance().deviceId()) {
   NVFUSER_CUDA_RT_SAFE_CALL(
       cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr()));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(
-      (void**)&semaphore_, sizeof(IpcSemaphore)));
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaMalloc((void**)&semaphore_, sizeof(IpcSemaphore)));
   static_assert(
       sizeof(IpcSemaphore) == sizeof(int),
       "IpcSemaphore must be same size as int");
   NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset(
-      (void*)semaphore_,
-      (int)IpcSemaphore::kReady,
-      sizeof(IpcSemaphore)));
+      (void*)semaphore_, (int)IpcSemaphore::kReady, sizeof(IpcSemaphore)));
   NVFUSER_CUDA_RT_SAFE_CALL(
       cudaIpcGetMemHandle(&semaphore_ipc_handle_, semaphore_));
 }
@@ -76,12 +73,14 @@ IpcHandle::~IpcHandle() {
   }
 }
 
-void IpcHandleCache::exchangeHandles(const std::vector<P2PCommunication*>& communications, const ExpressionEvaluator& expr_evaluator) {
+void IpcHandleCache::exchangeHandles(
+    const std::vector<P2PCommunication*>& communications,
+    const ExpressionEvaluator& expr_evaluator) {
   Communicator* communicator = &Communicator::getInstance();
   const int64_t my_rank = communicator->deviceId();
-  auto get_tensor = [&expr_evaluator](P2PCommunication* communication) -> at::Tensor {
-    return expr_evaluator.evaluate(communication->buffer())
-        .as<at::Tensor>();
+  auto get_tensor =
+      [&expr_evaluator](P2PCommunication* communication) -> at::Tensor {
+    return expr_evaluator.evaluate(communication->buffer()).as<at::Tensor>();
   };
 
   std::vector<P2PCommunication*> non_cached_communications;
@@ -92,7 +91,12 @@ void IpcHandleCache::exchangeHandles(const std::vector<P2PCommunication*>& commu
         expr_evaluator.evaluate(communication->src()).as<int64_t>();
     const bool is_sender = my_rank == src;
     const bool is_receiver = my_rank == dst;
-    NVF_ERROR(is_sender || is_receiver, "RANK ", my_rank, " is not involved in the p2p comm ", communication);
+    NVF_ERROR(
+        is_sender || is_receiver,
+        "RANK ",
+        my_rank,
+        " is not involved in the p2p comm ",
+        communication);
     if (is_sender && is_receiver) {
       continue;
     }
@@ -103,23 +107,24 @@ void IpcHandleCache::exchangeHandles(const std::vector<P2PCommunication*>& commu
   }
 
   // put memhandles to TCP store
-  auto get_tcp_store_key =
-      [&expr_evaluator](P2PCommunication* communication, int64_t rank) -> std::string {
+  auto get_tcp_store_key = [&expr_evaluator](
+                               P2PCommunication* communication,
+                               int64_t rank) -> std::string {
     return "nvfuser_ipc_handle_info_P2PComm_dst=" +
-        std::to_string(expr_evaluator.evaluate(communication->dst())
-                          .as<int64_t>()) +
+        std::to_string(
+               expr_evaluator.evaluate(communication->dst()).as<int64_t>()) +
         "_src=" +
-        std::to_string(expr_evaluator.evaluate(communication->src())
-                          .as<int64_t>()) +
+        std::to_string(
+               expr_evaluator.evaluate(communication->src()).as<int64_t>()) +
         "_rank=" + std::to_string(rank);
   };
   std::unordered_map<P2PCommunication*, std::unique_ptr<IpcHandle>>
       local_ipc_handles;
   auto store = communicator->getTcpStore();
   for (P2PCommunication* communication : non_cached_communications) {
-    auto buffer_handle =
-        std::make_unique<IpcHandle>(get_tensor(communication));
-    store->set(get_tcp_store_key(communication, my_rank), toBytes(*buffer_handle));
+    auto buffer_handle = std::make_unique<IpcHandle>(get_tensor(communication));
+    store->set(
+        get_tcp_store_key(communication, my_rank), toBytes(*buffer_handle));
     local_ipc_handles.emplace(communication, std::move(buffer_handle));
   }
 
@@ -130,27 +135,27 @@ void IpcHandleCache::exchangeHandles(const std::vector<P2PCommunication*>& commu
   // get memhandles from TCP store
   for (P2PCommunication* communication : non_cached_communications) {
     const auto dst =
-    expr_evaluator.evaluate(communication->dst()).as<int64_t>();
+        expr_evaluator.evaluate(communication->dst()).as<int64_t>();
     const auto src =
-    expr_evaluator.evaluate(communication->src()).as<int64_t>();
+        expr_evaluator.evaluate(communication->src()).as<int64_t>();
     int64_t peer = (my_rank == dst) ? src : dst;
 
     auto& local_ipc_handle = local_ipc_handles.at(communication);
 
     std::string key = get_tcp_store_key(communication, peer);
     NVF_ERROR(
-      store->check({key}),
-      "key ",
-      key,
-      " not found in store at rank ",
-      my_rank);
+        store->check({key}),
+        "key ",
+        key,
+        " not found in store at rank ",
+        my_rank);
     auto peer_ipc_handle = std::make_unique<IpcHandle>(store->get(key));
 
-    auto ipc_handles = std::make_unique<P2pIpcHandle>(std::move(local_ipc_handle), std::move(peer_ipc_handle));
+    auto ipc_handles = std::make_unique<P2pIpcHandle>(
+        std::move(local_ipc_handle), std::move(peer_ipc_handle));
 
-    insert(
-      communication, expr_evaluator, std::move(ipc_handles));
+    insert(communication, expr_evaluator, std::move(ipc_handles));
   }
 }
 
-} // nvfuser
+} // namespace nvfuser
diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h
index f81d74550ae..cba4d50b83c 100644
--- a/csrc/multidevice/ipc_handle.h
+++ b/csrc/multidevice/ipc_handle.h
@@ -6,9 +6,9 @@
  */
 // clang-format on
 #pragma once
+#include <ATen/core/TensorBody.h>
 #include <cuda.h>
 #include <expr_evaluator.h>
-#include <ATen/core/TensorBody.h>
 
 namespace nvfuser {
 
@@ -40,8 +40,10 @@ class IpcHandle {
 
 class P2pIpcHandle {
  public:
-
-  P2pIpcHandle(std::unique_ptr<IpcHandle> local, std::unique_ptr<IpcHandle> peer) : local_(std::move(local)), peer_(std::move(peer)) {}
+  P2pIpcHandle(
+      std::unique_ptr<IpcHandle> local,
+      std::unique_ptr<IpcHandle> peer)
+      : local_(std::move(local)), peer_(std::move(peer)) {}
 
   const auto& local() const {
     return *local_;
@@ -56,41 +58,56 @@ class P2pIpcHandle {
   std::unique_ptr<IpcHandle> peer_;
 };
 
-// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*)
-// we need a counter on Tensor+P2PCommunication* for each given dst, src
-// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID)
-// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle.
+// The cache key must match on (dst, src, tensor, Id of SendComm, Id of
+// RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*). We need
+// a counter on Tensor+P2PCommunication* for each given (dst, src). In the
+// store, the key must be computed from (dst, src, counter), because it can
+// depend neither on the tensor nor on the P2PCommunication* (not even its ID).
+// We could store the local and remote handles separately, or by first mapping
+// with the IpcHandle's rank. Btw, we need to add rank to IpcHandle.
 class IpcHandleCache {
  public:
   IpcHandleCache() = default;
   ~IpcHandleCache() = default;
 
-  const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) const {
+  const P2pIpcHandle& get(
+      P2PCommunication* communication,
+      ExpressionEvaluator& expr_evaluator) const {
     auto it = find(communication, expr_evaluator);
     NVF_ERROR(
-      it != nullptr,
-      "No remote buffer found for ",
-      communication->toString());
+        it != nullptr,
+        "No remote buffer found for ",
+        communication->toString());
     return *it;
   }
 
-  void exchangeHandles(const std::vector<P2PCommunication*>& communications, const ExpressionEvaluator& expr_evaluator);
+  void exchangeHandles(
+      const std::vector<P2PCommunication*>& communications,
+      const ExpressionEvaluator& expr_evaluator);
 
  private:
   using KeyType = std::tuple<int64_t, int64_t, at::Tensor, P2PCommunication*>;
 
-  KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const  {
+  KeyType getKey(
+      P2PCommunication* comm,
+      const ExpressionEvaluator& expr_evaluator) const {
     int64_t dst = expr_evaluator.evaluate(comm->dst()).as<int64_t>();
     int64_t src = expr_evaluator.evaluate(comm->src()).as<int64_t>();
-    at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as<at::Tensor>();
+    at::Tensor buffer =
+        expr_evaluator.evaluate(comm->buffer()).as<at::Tensor>();
     return std::make_tuple(dst, src, buffer, comm);
   }
 
-  void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr<P2pIpcHandle> handle)  {
+  void insert(
+      P2PCommunication* comm,
+      const ExpressionEvaluator& expr_evaluator,
+      std::unique_ptr<P2pIpcHandle> handle) {
     handles_[getKey(comm, expr_evaluator)] = std::move(handle);
   }
 
-  P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const  {
+  P2pIpcHandle* find(
+      P2PCommunication* comm,
+      const ExpressionEvaluator& expr_evaluator) const {
     auto it = handles_.find(getKey(comm, expr_evaluator));
     if (it == handles_.end()) {
       return nullptr;
@@ -103,8 +120,8 @@ class IpcHandleCache {
       auto ptr = reinterpret_cast<std::uintptr_t>(tensor.data_ptr());
       auto offset = tensor.storage_offset();
       auto element_size = tensor.element_size();
-      return std::hash<std::uintptr_t>()(ptr) ^ std::hash<int64_t>()(offset) << 32 ^
-          std::hash<int>()(element_size);
+      return std::hash<std::uintptr_t>()(ptr) ^
+          std::hash<int64_t>()(offset) << 32 ^ std::hash<int>()(element_size);
     }
   };
 
@@ -117,27 +134,23 @@ class IpcHandleCache {
   struct KeyHash {
     std::size_t operator()(const KeyType& key) const {
       return (std::hash<int64_t>()(std::get<0>(key)) << 13) ^
-         (std::hash<int64_t>()(std::get<1>(key)) << 7) ^
-         (TensorHash{}(std::get<2>(key))) ^
-         (std::hash<P2PCommunication*>()(std::get<3>(key)));
+          (std::hash<int64_t>()(std::get<1>(key)) << 7) ^
+          (TensorHash{}(std::get<2>(key))) ^
+          (std::hash<P2PCommunication*>()(std::get<3>(key)));
     }
   };
 
   struct KeyEqual {
     bool operator()(const KeyType& lhs, const KeyType& rhs) const {
       return std::get<0>(lhs) == std::get<0>(rhs) &&
-             std::get<1>(lhs) == std::get<1>(rhs) &&
-             TensorEqual{}(std::get<2>(lhs), std::get<2>(rhs)) &&
-             std::get<3>(lhs) == std::get<3>(rhs);
+          std::get<1>(lhs) == std::get<1>(rhs) &&
+          TensorEqual{}(std::get<2>(lhs), std::get<2>(rhs)) &&
+          std::get<3>(lhs) == std::get<3>(rhs);
     }
   };
 
-  std::unordered_map<
-    KeyType,
-    std::unique_ptr<P2pIpcHandle>,
-    KeyHash,
-    KeyEqual>
-    handles_;
+  std::unordered_map<KeyType, std::unique_ptr<P2pIpcHandle>, KeyHash, KeyEqual>
+      handles_;
 };
 
-} // nvfuser
+} // namespace nvfuser

From ed69f75d4c4fcd92f44c06f23cb5d8c77c892aef Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 14:58:38 -0800
Subject: [PATCH 53/55] minor

---
 csrc/multidevice/ipc_handle.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h
index cba4d50b83c..5df55fec441 100644
--- a/csrc/multidevice/ipc_handle.h
+++ b/csrc/multidevice/ipc_handle.h
@@ -121,7 +121,7 @@ class IpcHandleCache {
       auto offset = tensor.storage_offset();
       auto element_size = tensor.element_size();
       return std::hash<std::uintptr_t>()(ptr) ^
-          std::hash<int64_t>()(offset) << 32 ^ std::hash<int>()(element_size);
+          std::hash<int64_t>()(offset << 8) ^ std::hash<int>()(element_size);
     }
   };
 
@@ -133,8 +133,8 @@ class IpcHandleCache {
 
   struct KeyHash {
     std::size_t operator()(const KeyType& key) const {
-      return (std::hash<int64_t>()(std::get<0>(key)) << 13) ^
-          (std::hash<int64_t>()(std::get<1>(key)) << 7) ^
+      return (std::hash<int64_t>()(std::get<0>(key) << 13)) ^
+          (std::hash<int64_t>()(std::get<1>(key) << 7)) ^
           (TensorHash{}(std::get<2>(key))) ^
           (std::hash<P2PCommunication*>()(std::get<3>(key)));
     }

From 929ae0df55efbda3bb342229f56f94b61a6138a8 Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 15:13:49 -0800
Subject: [PATCH 54/55] minor

---
 csrc/host_ir/host_ir.cpp           | 6 +++++-
 csrc/host_ir/host_ir.h             | 2 +-
 csrc/multidevice/communication.cpp | 8 ++++----
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp
index edc9c476eaf..b5cacfd71b8 100644
--- a/csrc/host_ir/host_ir.cpp
+++ b/csrc/host_ir/host_ir.cpp
@@ -339,7 +339,11 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(ShareMemHandles)
 
 std::string ShareMemHandles::toString(int indent_size) const {
   std::stringstream ss;
-  indent(ss, indent_size) << "ShareMemHandles" << std::endl;
+  indent(ss, indent_size) << "ShareMemHandles(";
+  for (auto* communication : communications()) {
+    ss << communication->toInlineString() << ", ";
+  }
+  ss << ")" << std::endl; // close the parenthesis opened above
   return ss.str();
 }
 
diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h
index efb23b95d67..d7fa4512db8 100644
--- a/csrc/host_ir/host_ir.h
+++ b/csrc/host_ir/host_ir.h
@@ -335,7 +335,7 @@ class ShareMemHandles : public Expr {
     return "hir::ShareMemHandles";
   }
 
-  const std::vector<P2PCommunication*>& communications() {
+  const std::vector<P2PCommunication*>& communications() const {
     return attribute<std::vector<P2PCommunication*>>(0);
   }
 };
diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp
index e48290241b0..1c1ada7f32c 100644
--- a/csrc/multidevice/communication.cpp
+++ b/csrc/multidevice/communication.cpp
@@ -230,17 +230,17 @@ P2PCommunication::P2PCommunication(
 
 NVFUSER_DEFINE_CLONE_AND_CREATE(P2PCommunication)
 
-std::string P2PCommunication::toString(const int indent_size) const {
+std::string P2PCommunication::toInlineString(const int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << "P2PCommunication " << name() << " ("
                           << "buffer=" << buffer() << ", "
                           << "dst=" << dst() << ", "
-                          << "src=" << src() << ")\n";
+                          << "src=" << src() << ")";
   return ss.str();
 }
 
-std::string P2PCommunication::toInlineString(int indent_size) const {
-  return toString(indent_size);
+std::string P2PCommunication::toString(int indent_size) const {
+  return toInlineString(indent_size) + "\n";
 }
 
 namespace {

From 359779d2778a1d4ed337499f669fb33123c3b9bb Mon Sep 17 00:00:00 2001
From: snordmann <snordmann@nvidia.com>
Date: Wed, 12 Feb 2025 15:44:09 -0800
Subject: [PATCH 55/55] move p2p runtime in separate file

---
 CMakeLists.txt                                |  1 +
 csrc/host_ir/executor.cpp                     | 70 +++----------------
 csrc/multidevice/cuda_p2p.cpp                 | 70 +++++++++++++++++++
 csrc/multidevice/cuda_p2p.h                   | 22 ++++++
 csrc/multidevice/ipc_handle.h                 |  1 +
 tests/cpp/test_multidevice_communications.cpp | 14 ++--
 6 files changed, 109 insertions(+), 69 deletions(-)
 create mode 100644 csrc/multidevice/cuda_p2p.cpp
 create mode 100644 csrc/multidevice/cuda_p2p.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e2afe3cb6e..6b16cb0c075 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,6 +171,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/mma_type.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp
+  ${NVFUSER_SRCS_DIR}/multidevice/cuda_p2p.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/ipc_handle.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp
   ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index f8ef8572dfe..7bd38e90c15 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -15,6 +15,7 @@
 #include <instrumentation.h>
 #include <ir/utils.h>
 #include <multidevice/communication.h>
+#include <multidevice/cuda_p2p.h>
 #include <multidevice/utils.h>
 #include <options.h>
 #include <runtime/allocations.h>
@@ -472,58 +473,12 @@ void HostIrEvaluator::handle(P2PCommunication* communication) {
 
   const P2pIpcHandle& ipc_handles =
       ipc_handle_cache_.get(communication, expr_evaluator_);
-  const IpcHandle& peer_buffer = ipc_handles.peer();
-  const auto local_semaphore =
-      reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore());
-  const auto remote_semaphore =
-      reinterpret_cast<CUdeviceptr>(ipc_handles.peer().semaphore());
-  static_assert(
-      sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits");
-
-  const auto current_stream = reinterpret_cast<CUstream>(
-      c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
-
+  const auto current_stream = static_cast<CUstream>(
+    c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
   if (is_receiver) {
-    // wait for sender to be ready
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
-        current_stream,
-        local_semaphore,
-        (cuuint32_t)(IpcSemaphore::kInUse),
-        CU_STREAM_WAIT_VALUE_EQ));
-    // RDMA get the data from the sender
-    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
-        buffer.data_ptr(),
-        peer_buffer.ptr(),
-        buffer.numel() * buffer.element_size(),
-        cudaMemcpyDeviceToDevice,
-        current_stream));
-    // Signals completion to self
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
-        current_stream,
-        local_semaphore,
-        (cuuint32_t)(IpcSemaphore::kReady),
-        CU_STREAM_WRITE_VALUE_DEFAULT));
-    // Signals completion to receiver
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
-        current_stream,
-        remote_semaphore,
-        (cuuint32_t)(IpcSemaphore::kReady),
-        CU_STREAM_WRITE_VALUE_DEFAULT));
+    getZcopy::RecvPost(ipc_handles, buffer.numel() * buffer.element_size(), current_stream);
   } else /*sender*/ {
-    // signal to self that transfer is in progress
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
-        current_stream,
-        local_semaphore,
-        (cuuint32_t)(IpcSemaphore::kInUse),
-        CU_STREAM_WRITE_VALUE_DEFAULT));
-    // signal to receiver that the buffer is ready
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
-        current_stream,
-        remote_semaphore,
-        (cuuint32_t)(IpcSemaphore::kInUse),
-        CU_STREAM_WRITE_VALUE_DEFAULT)); // passing
-                                         // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
-                                         // gives an error
+    getZcopy::SendPost(ipc_handles, current_stream);
   }
 }
 
@@ -544,23 +499,14 @@ void HostIrEvaluator::handle(Wait* wait) {
   }
 
   const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as<int64_t>();
+  const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as<int64_t>();
   const int64_t my_rank = communicator_->deviceId();
-  if (my_rank == src) {
+  if (my_rank == src && src != dst) {
     const auto current_stream = static_cast<CUstream>(
         c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream());
-    at::Tensor buffer =
-        getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_);
-
     const P2pIpcHandle& ipc_handles =
         ipc_handle_cache_.get(p2p_comm, expr_evaluator_);
-    const auto local_semaphore =
-        reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore());
-
-    NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
-        current_stream,
-        local_semaphore,
-        (cuuint32_t)(IpcSemaphore::kReady),
-        CU_STREAM_WAIT_VALUE_EQ));
+    getZcopy::SendWait(ipc_handles, current_stream);
   }
 }
 
diff --git a/csrc/multidevice/cuda_p2p.cpp b/csrc/multidevice/cuda_p2p.cpp
new file mode 100644
index 00000000000..d4aa148cb4a
--- /dev/null
+++ b/csrc/multidevice/cuda_p2p.cpp
@@ -0,0 +1,70 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include<cuda_utils.h>
+#include<multidevice/cuda_p2p.h>
+
+namespace nvfuser {
+
+namespace getZcopy {
+
+void RecvPost(const P2pIpcHandle& ipc_handles, int64_t count, CUstream stream) {
+  // wait for sender to be ready
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
+      stream,
+      reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore()),
+      static_cast<cuuint32_t>(IpcSemaphore::kInUse),
+      CU_STREAM_WAIT_VALUE_EQ));
+  // RDMA get the data from the sender
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(
+      ipc_handles.local().ptr(),
+      ipc_handles.peer().ptr(),
+      count,
+      cudaMemcpyDeviceToDevice,
+      stream));
+  // Signals completion to self
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+      stream,
+      reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore()),
+      static_cast<cuuint32_t>(IpcSemaphore::kReady),
+      CU_STREAM_WRITE_VALUE_DEFAULT));
+  // Signals completion to sender
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+      stream,
+      reinterpret_cast<CUdeviceptr>(ipc_handles.peer().semaphore()),
+      static_cast<cuuint32_t>(IpcSemaphore::kReady),
+      CU_STREAM_WRITE_VALUE_DEFAULT));
+}
+
+void SendPost(const P2pIpcHandle& ipc_handles, CUstream stream) {
+  // signal to self that transfer is in progress
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+      stream,
+      reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore()),
+      static_cast<cuuint32_t>(IpcSemaphore::kInUse),
+      CU_STREAM_WRITE_VALUE_DEFAULT));
+  // signal to receiver that the buffer is ready. Passing
+  // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER here gives an error.
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(
+      stream,
+      reinterpret_cast<CUdeviceptr>(ipc_handles.peer().semaphore()),
+      static_cast<cuuint32_t>(IpcSemaphore::kInUse),
+      CU_STREAM_WRITE_VALUE_DEFAULT));
+}
+
+void SendWait(const P2pIpcHandle& ipc_handles, CUstream stream) {
+  // wait until the receiver (RecvPost) marks our semaphore as kReady
+  NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(
+      stream,
+      reinterpret_cast<CUdeviceptr>(ipc_handles.local().semaphore()),
+      static_cast<cuuint32_t>(IpcSemaphore::kReady),
+      CU_STREAM_WAIT_VALUE_EQ));
+}
+
+} // namespace getZcopy
+
+} // namespace nvfuser
diff --git a/csrc/multidevice/cuda_p2p.h b/csrc/multidevice/cuda_p2p.h
new file mode 100644
index 00000000000..45d2fdd2558
--- /dev/null
+++ b/csrc/multidevice/cuda_p2p.h
@@ -0,0 +1,22 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <cuda.h>
+#include <multidevice/ipc_handle.h>
+
+namespace nvfuser {
+
+namespace getZcopy {
+
+void RecvPost(const P2pIpcHandle& ipc_handles, int64_t count, CUstream stream);
+void SendPost(const P2pIpcHandle& ipc_handles, CUstream stream);
+void SendWait(const P2pIpcHandle& ipc_handles, CUstream stream);
+
+} // namespace getZcopy
+
+} // namespace nvfuser
diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h
index 5df55fec441..70e5a3bf560 100644
--- a/csrc/multidevice/ipc_handle.h
+++ b/csrc/multidevice/ipc_handle.h
@@ -151,6 +151,7 @@ class IpcHandleCache {
 
   std::unordered_map<KeyType, std::unique_ptr<P2pIpcHandle>, KeyHash, KeyEqual>
       handles_;
+  // TODO: add counter to support multiple send/recv per pair of ranks
 };
 
 } // namespace nvfuser
diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp
index 9db4f3a78eb..d0a3da5a26f 100644
--- a/tests/cpp/test_multidevice_communications.cpp
+++ b/tests/cpp/test_multidevice_communications.cpp
@@ -432,29 +432,29 @@ TEST_F(P2PCommunicationTest, CudaComm) {
   FusionGuard fg(container.get());
 
   auto* my_rank_val = IrBuilder::create<Val>(my_rank, DataType::Int);
-  auto* recv_peer_val = IrBuilder::create<Val>(recv_peer, DataType::Int);
   auto* send_peer_val = IrBuilder::create<Val>(send_peer, DataType::Int);
+  auto* recv_peer_val = IrBuilder::create<Val>(recv_peer, DataType::Int);
 
   auto* send_tv = makeContigTensor(1);
   auto* recv_tv = makeContigTensor(1);
   container->addInput(send_tv);
   container->addInput(recv_tv);
 
-  auto recv = IrBuilder::create<P2PCommunication>(
-      recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda);
   auto send = IrBuilder::create<P2PCommunication>(
-      send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda);
-  std::vector<P2PCommunication*> grouped_communications = {recv, send};
+    send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda);
+    auto recv = IrBuilder::create<P2PCommunication>(
+        recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda);
+  std::vector<P2PCommunication*> grouped_communications = {send, recv};
   auto share_mem_handles = IrBuilder::create<hir::ShareMemHandles>(
       std::move(grouped_communications));
-  auto wait_recv = IrBuilder::create<hir::Wait>(recv);
   auto wait_send = IrBuilder::create<hir::Wait>(send);
+  auto wait_recv = IrBuilder::create<hir::Wait>(recv);
 
   container->pushBackTopLevelExprs(share_mem_handles);
   container->pushBackTopLevelExprs(send);
   container->pushBackTopLevelExprs(recv);
-  container->pushBackTopLevelExprs(wait_recv);
   container->pushBackTopLevelExprs(wait_send);
+  container->pushBackTopLevelExprs(wait_recv);
 
   hir::HostIrEvaluator executor(std::move(container), communicator_);