From 2c04df94cfd34b37b9ce2c27f937afdba8e79ecc Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 24 Oct 2024 02:08:07 +0300 Subject: [PATCH 01/55] working simple benchmark --- csrc/multidevice/communicator.cpp | 2 + tests/cpp/test_multidevice_overlap.cpp | 103 +++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 8197ea224f4..ae6fc1fd9b4 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -196,6 +196,8 @@ Communicator::Communicator( return; } + cudaSetDevice(local_rank_); + #ifdef NVFUSER_DISTRIBUTED c10d::TCPStoreOptions store_opts; { diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 39cab67cd13..5def14c8045 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace nvfuser { @@ -40,6 +41,108 @@ void synchronizeStreams(const std::vector& streams) { } // namespace +using OverlapBenchmarkParams = std::tuple< + CommunicatorBackend, + /*S=*/int64_t, + /*M=*/int64_t, + /*K=*/int64_t, + /*N=*/int64_t, + /*number_of_streams=*/int64_t>; + +class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { + protected: + static std::unordered_map times; + + static void TearDownTestSuite() { + auto rank = Communicator::getInstance().deviceId(); + for (auto it: times) { + std::cout << "rank " << rank << ": " << it.first << ": " << it.second << std::endl; + } + } +}; + +std::unordered_map OverlapBenchmark::times = {}; + +TEST_P(OverlapBenchmark, DummyBenchmark) { + constexpr int64_t number_of_warmups = 120; + constexpr int64_t number_of_iterations = 500; + const int64_t D = communicator_->size(); + auto [backend, + S, + M, + K, + N, + number_of_streams] = GetParam(); + + GTEST_ASSERT_EQ(M % S, 0); + + auto world = communicator_->getWorld(backend); + + std::vector streams = + 
createStreams(number_of_streams, communicator_->deviceId()); + + auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + auto ta = at::randn({S, M/S,K}, options); + auto ta_unsharded = at::empty({S, D, M/S,K}, options); + auto tb = at::randn({K,N}, options); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + for (const auto& iteration : + c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == number_of_warmups) { + cudaEventRecord(start); + } + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + + auto ta_j = ta.select(0, j); + auto ta_unsharded_j = ta_unsharded.select(0, j); + + // communication + world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); + synchronizeStreams(streams); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= number_of_iterations; + + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + times.insert({test_name, milliseconds}); +} + +INSTANTIATE_TEST_SUITE_P( + , + OverlapBenchmark, + testing::Combine( + testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), + /*S=*/testing::Values(1,2,4,8), + /*M=*/testing::Values(pow(2,10), pow(2,15)), + /*K=*/testing::Values(pow(2,10), pow(2,15)), + /*N=*/testing::Values(pow(2,10)), + /*number_of_streams=*/testing::Values(3, 8)), + [](const testing::TestParamInfo& info) + -> std::string { + std::ostringstream os; + os << /*backend*/std::get<0>(info.param) << "_" + << "S" << std::get<1>(info.param) << "_" + << "M" << std::get<2>(info.param) << "_" + << "K" << std::get<3>(info.param) << "_" + << "N" << std::get<4>(info.param) << "_" + << 
"Streams" << std::get<5>(info.param); + return os.str(); + }); + + struct OverlapTestParams { // Tensors sizes int64_t M = std::pow(2, 6); From af36cf14ac7622945ba6ec6ff8ce68434cc94230 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Oct 2024 03:54:42 +0300 Subject: [PATCH 02/55] minor --- tests/cpp/test_multidevice_overlap.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 5def14c8045..76e8192d1fd 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,7 @@ using OverlapBenchmarkParams = std::tuple< class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: - static std::unordered_map times; + static std::map times; static void TearDownTestSuite() { auto rank = Communicator::getInstance().deviceId(); @@ -61,11 +61,13 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf } }; -std::unordered_map OverlapBenchmark::times = {}; +std::map OverlapBenchmark::times = {}; TEST_P(OverlapBenchmark, DummyBenchmark) { - constexpr int64_t number_of_warmups = 120; - constexpr int64_t number_of_iterations = 500; + int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 100; + + const int64_t D = communicator_->size(); auto [backend, S, @@ -118,6 +120,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; } INSTANTIATE_TEST_SUITE_P( From 68b858a7fcd16e0f79fd62cafe6401496c924c60 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 25 Oct 2024 07:22:06 -0700 Subject: [PATCH 03/55] test script --- bench/process_outputs | 2 ++ bench/test | 35 ++++++++++++++++++++++++++ 
tests/cpp/test_multidevice_overlap.cpp | 4 +-- 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 bench/process_outputs create mode 100755 bench/test diff --git a/bench/process_outputs b/bench/process_outputs new file mode 100644 index 00000000000..139597f9cb0 --- /dev/null +++ b/bench/process_outputs @@ -0,0 +1,2 @@ + + diff --git a/bench/test b/bench/test new file mode 100755 index 00000000000..f0d5728fb4b --- /dev/null +++ b/bench/test @@ -0,0 +1,35 @@ +#!/bin/bash +EXPERIMENT=tl_nccl +DATE=$(date +%Y%m%d-%H%M) +LOG_BASE="/opt/pytorch/Fuser/bench/logs" + +export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" + +mkdir -p $LOGS +LOG_FILE_INFO="${LOGS}/info" +echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO + +NP=8 +BACKEND=UCC +S=* +M=* +K=* +N=* +Streams=* +export GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO + +MPIFLAGS=" -np $NP" +MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO + +TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" +echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO + +CMD="mpirun $MPIFLAGS $TEST_CMD" +echo $CMD | tee -a $LOG_FILE_INFO +$CMD | tee -a $LOG_FILE_INFO + diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 76e8192d1fd..b8c998618b4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -128,11 +128,11 @@ INSTANTIATE_TEST_SUITE_P( OverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*S=*/testing::Values(1,2,4,8), + /*S=*/testing::Values(1,2,4,8, 16, 32), /*M=*/testing::Values(pow(2,10), pow(2,15)), /*K=*/testing::Values(pow(2,10), pow(2,15)), 
/*N=*/testing::Values(pow(2,10)), - /*number_of_streams=*/testing::Values(3, 8)), + /*number_of_streams=*/testing::Values(3, 8, 32)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; From 0c3493b6c1782d27b5f417d2237751a0b37bf8df Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 28 Oct 2024 13:13:09 +0200 Subject: [PATCH 04/55] minor --- bench/process_outputs | 5 +++++ bench/test | 2 +- tests/cpp/test_multidevice_overlap.cpp | 5 ++++- 3 files changed, 10 insertions(+), 2 deletions(-) mode change 100644 => 100755 bench/process_outputs diff --git a/bench/process_outputs b/bench/process_outputs old mode 100644 new mode 100755 index 139597f9cb0..c1781394dbc --- a/bench/process_outputs +++ b/bench/process_outputs @@ -1,2 +1,7 @@ +#!/bin/bash +FILE="/opt/pytorch/Fuser/bench/logs/${1}/info" +cat $FILE | grep "rank 0: " #| awk '{print $4}' + +# | grep -E 'Streams32\b' \ No newline at end of file diff --git a/bench/test b/bench/test index f0d5728fb4b..b6375719387 100755 --- a/bench/test +++ b/bench/test @@ -10,7 +10,7 @@ LOG_FILE_INFO="${LOGS}/info" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=* M=* K=* diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index b8c998618b4..2febd097b62 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -55,8 +55,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf static void TearDownTestSuite() { auto rank = Communicator::getInstance().deviceId(); + if (rank != 0) { + return; + } for (auto it: times) { - std::cout << "rank " << rank << ": " << it.first << ": " << it.second << std::endl; + std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl; } } }; From b30b44bb897c0ec290f37f0e0e02d82ceea3421f Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 29 Oct 2024 09:46:09 -0700 Subject: [PATCH 05/55] add nsight profiling 
--- bench/test | 36 ++++++++++++++++---------- tests/cpp/test_multidevice_overlap.cpp | 7 +++++ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/bench/test b/bench/test index b6375719387..8ce85c8ff0f 100755 --- a/bench/test +++ b/bench/test @@ -1,35 +1,45 @@ #!/bin/bash -EXPERIMENT=tl_nccl +EXPERIMENT=profile DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" mkdir -p $LOGS -LOG_FILE_INFO="${LOGS}/info" +LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=* -M=* -K=* -N=* -Streams=* -export GTEST_FILTER="OverlapBenchmark.DummyBenchmark/${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +S=4 +M=32768 +K=32768 +N=1024 +Streams=8 +GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO - +`` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO +MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO -CMD="mpirun $MPIFLAGS $TEST_CMD" -echo $CMD | tee -a $LOG_FILE_INFO -$CMD | tee -a $LOG_FILE_INFO +MPICMD="mpirun $MPIFLAGS $TEST_CMD" +echo $MPICMD | tee -a $LOG_FILE_INFO + +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" + +CMD="${NSYSCMD} ${MPICMD}" +sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" +echo $CMD | 
tee -a ${LOG_FILE_INFO} +$CMD | tee -a ${LOG_FILE_INFO} diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 2febd097b62..189a0da732c 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace nvfuser { @@ -97,6 +98,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == 10) { + cudaProfilerStart();; + } if (iteration == number_of_warmups) { cudaEventRecord(start); } @@ -114,6 +118,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { } setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); synchronizeStreams(streams); + if (iteration == 15) { + cudaProfilerStop();; + } } cudaEventRecord(stop); cudaEventSynchronize(stop); From 0592a139918072d66c13790741310ecc195abe45 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 31 Oct 2024 06:39:08 -0700 Subject: [PATCH 06/55] nsight and tl/nccl/ sync mode --- bench/test | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bench/test b/bench/test index 8ce85c8ff0f..b51daa63ebd 100755 --- a/bench/test +++ b/bench/test @@ -1,16 +1,16 @@ #!/bin/bash -EXPERIMENT=profile +EXPERIMENT=profile_driver DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" mkdir -p $LOGS -LOG_FILE_INFO="${LOGS}/info.txt" +export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL +BACKEND=UCC S=4 M=32768 K=32768 @@ -28,6 +28,7 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice 
--gtest_filter=${GTEST_FILTER}" From 0037b1e9b1398b9518a80011b1601f7e4f6cda5a Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 05:12:10 -0800 Subject: [PATCH 07/55] add cuStreamWriteValue but linkage error --- bench/test | 7 ++++--- tests/cpp/test_multidevice_overlap.cpp | 29 ++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/bench/test b/bench/test index b51daa63ebd..2856cff9074 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_driver +EXPERIMENT=profile_ncc_max_connection2 DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=4 M=32768 K=32768 @@ -28,7 +28,8 @@ MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=driver" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 189a0da732c..8fdaf8afdd9 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,8 @@ using OverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*number_of_streams=*/int64_t>; + /*number_of_streams=*/int64_t, + /*add_cuStreamWriteValue32=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -78,7 +80,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { M, K, N, - number_of_streams] = GetParam(); + number_of_streams, + 
add_cuStreamWriteValue32] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -96,6 +99,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); + // CUdeviceptr pDevice; + // void* ptr; + // if (add_cuStreamWriteValue32) { + // cudaMallocHost(&ptr, 32); + // cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + // } + for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { if (iteration == 10) { @@ -113,6 +123,11 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + + // if (add_cuStreamWriteValue32) { + // cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + // } + // compute auto tc_j = torch::matmul(ta_unsharded_j,tb); } @@ -131,6 +146,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + // if (add_cuStreamWriteValue32) { + // cudaFree(ptr); + // } } INSTANTIATE_TEST_SUITE_P( @@ -142,7 +161,8 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15)), /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), - /*number_of_streams=*/testing::Values(3, 8, 32)), + /*number_of_streams=*/testing::Values(3, 8, 32), + /*add_cuStreamWriteValue32*/testing::Values(false)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +171,8 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<2>(info.param) << "_" << "K" << std::get<3>(info.param) << "_" << "N" << std::get<4>(info.param) << "_" - << "Streams" << std::get<5>(info.param); + << "Streams" << std::get<5>(info.param) << "_" + << ((std::get<6>(info.param))? 
"With" : "Without") << "cuStreamWriteValue32"; return os.str(); }); From ec71e233de02deefb609bd81d2bd7dd6b3f2451f Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 06:27:56 -0800 Subject: [PATCH 08/55] multiple pgs --- bench/test | 13 +++++++------ tests/cpp/test_multidevice_overlap.cpp | 17 ++++++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/bench/test b/bench/test index 2856cff9074..5433bbee9ce 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_ncc_max_connection2 +EXPERIMENT=profile_baseline_NCCL DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,25 +11,26 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=4 +S=1 M=32768 K=32768 N=1024 Streams=8 +Pgs=1 GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" -MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +# MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 8fdaf8afdd9..ff79bb45609 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -50,7 +50,8 @@ using OverlapBenchmarkParams 
= std::tuple< /*K=*/int64_t, /*N=*/int64_t, /*number_of_streams=*/int64_t, - /*add_cuStreamWriteValue32=*/bool>; + /*add_cuStreamWriteValue32=*/bool, + /*number_of_pgs=*/int64_t>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,11 +82,13 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { K, N, number_of_streams, - add_cuStreamWriteValue32] = GetParam(); + add_cuStreamWriteValue32, + number_of_pgs] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); - auto world = communicator_->getWorld(backend); + std::vector all_ranks(communicator_->size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); std::vector streams = createStreams(number_of_streams, communicator_->deviceId()); @@ -118,6 +121,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); @@ -162,7 +167,8 @@ INSTANTIATE_TEST_SUITE_P( /*K=*/testing::Values(pow(2,10), pow(2,15)), /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false)), + /*add_cuStreamWriteValue32*/testing::Values(false), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -172,7 +178,8 @@ INSTANTIATE_TEST_SUITE_P( << "K" << std::get<3>(info.param) << "_" << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" - << ((std::get<6>(info.param))? "With" : "Without") << "cuStreamWriteValue32"; + << ((std::get<6>(info.param))? 
"WithcuStreamWriteValue32_" : "") + << "Pgs" << std::get<7>(info.param); return os.str(); }); From a15fdfc9d84258d38442a78110d57be1a121598c Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 06:39:54 -0800 Subject: [PATCH 09/55] reenable cuStreamValue32 --- bench/test | 9 +++++---- csrc/driver_api.h | 1 + tests/cpp/test_multidevice_overlap.cpp | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/bench/test b/bench/test index 5433bbee9ce..4f3559e283a 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_baseline_NCCL +EXPERIMENT=profile_cuStreamWrite_NCCL DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,14 +11,15 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=1 +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 +cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_Pgs${Pgs}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` @@ -39,7 +40,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/csrc/driver_api.h b/csrc/driver_api.h index b8c413a4054..8105cf855c2 100644 --- 
a/csrc/driver_api.h +++ b/csrc/driver_api.h @@ -32,6 +32,7 @@ namespace nvfuser { fn(cuModuleGetFunction); \ fn(cuModuleLoadDataEx); \ fn(cuModuleUnload); \ + fn(cuStreamWriteValue32); \ fn(cuOccupancyMaxActiveBlocksPerMultiprocessor) #if (CUDA_VERSION >= 12000) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index ff79bb45609..fef6e9bf468 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -102,12 +102,12 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); - // CUdeviceptr pDevice; - // void* ptr; - // if (add_cuStreamWriteValue32) { - // cudaMallocHost(&ptr, 32); - // cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); - // } + CUdeviceptr pDevice; + void* ptr; + if (add_cuStreamWriteValue32) { + cudaMallocHost(&ptr, 32); + cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + } for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { @@ -129,9 +129,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); - // if (add_cuStreamWriteValue32) { - // cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); - // } + if (add_cuStreamWriteValue32) { + cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } // compute auto tc_j = torch::matmul(ta_unsharded_j,tb); @@ -152,9 +152,9 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { times.insert({test_name, milliseconds}); std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - // if (add_cuStreamWriteValue32) { - // cudaFree(ptr); - // } + if (add_cuStreamWriteValue32) { + cudaFree(ptr); + } } INSTANTIATE_TEST_SUITE_P( @@ -167,7 +167,7 @@ INSTANTIATE_TEST_SUITE_P( /*K=*/testing::Values(pow(2,10), pow(2,15)), 
/*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false), + /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), [](const testing::TestParamInfo& info) -> std::string { From 6682a33b366b3f21a1ced568106e8a3b475c8567 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 07:57:44 -0800 Subject: [PATCH 10/55] add tl/cuda and ec/cuda flags in bash test script --- bench/test | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bench/test b/bench/test index 4f3559e283a..5ad427b4876 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_cuStreamWrite_NCCL +EXPERIMENT=profile_UCC_TL_CUDA DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,14 +10,14 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=8 +BACKEND=UCC +S=4 M=32768 K=32768 N=1024 Streams=8 Pgs=1 -cuStreamWrite=WithcuStreamWriteValue32_ +# cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" @@ -25,11 +25,21 @@ echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" +# MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" +# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" +# MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" +# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" +# MPIFLAGS+=" -x 
UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" + # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO From b01f1f4fe236be4144182cac5cbdcef15c559337 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 4 Nov 2024 08:40:14 -0800 Subject: [PATCH 11/55] add option to unfuse loops --- bench/test | 5 +++-- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/bench/test b/bench/test index 5ad427b4876..2102c1eb743 100755 --- a/bench/test +++ b/bench/test @@ -11,15 +11,16 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=UCC -S=4 +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 +UNFUSE="_unfused" # cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index fef6e9bf468..d4b9c757f7a 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,8 @@ using OverlapBenchmarkParams = std::tuple< /*N=*/int64_t, /*number_of_streams=*/int64_t, /*add_cuStreamWriteValue32=*/bool, - /*number_of_pgs=*/int64_t>; + /*number_of_pgs=*/int64_t, + /*unfuse_loops=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ 
-83,7 +84,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { N, number_of_streams, add_cuStreamWriteValue32, - number_of_pgs] = GetParam(); + number_of_pgs, + unfuse_loops] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -132,9 +134,20 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { if (add_cuStreamWriteValue32) { cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); } + if (unfuse_loops == false) { + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + } + if (unfuse_loops) { + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + auto ta_unsharded_j = ta_unsharded.select(0, j); - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } } setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); synchronizeStreams(streams); @@ -168,7 +181,8 @@ INSTANTIATE_TEST_SUITE_P( /*N=*/testing::Values(pow(2,10)), /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8), + /*unfuse_loops=*/testing::Values(false, true)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -179,7 +193,8 @@ INSTANTIATE_TEST_SUITE_P( << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") - << "Pgs" << std::get<7>(info.param); + << "Pgs" << std::get<7>(info.param) + << ((std::get<8>(info.param))? "_unfused" : ""); return os.str(); }); From ea7fd37d61ad310c5dcb2d8ca599d8212003ff44 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 5 Nov 2024 02:53:36 -0800 Subject: [PATCH 12/55] add cuda graphs. 
Only working for NCCL and S1 bc there is a syncStream in nccl --- bench/test | 13 ++-- tests/cpp/test_multidevice_overlap.cpp | 84 ++++++++++++++++---------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/bench/test b/bench/test index 2102c1eb743..8a64225d9e9 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_UCC_TL_CUDA +EXPERIMENT=profile_cudaGraph_NCCL_S1 DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,17 +10,18 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC -S=8 +BACKEND=NCCL +S=1 M=32768 K=32768 N=1024 Streams=8 Pgs=1 -UNFUSE="_unfused" +# UNFUSE="_unfused" +GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO `` @@ -51,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index d4b9c757f7a..c93987890b4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ 
b/tests/cpp/test_multidevice_overlap.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include #include @@ -52,7 +53,8 @@ using OverlapBenchmarkParams = std::tuple< /*number_of_streams=*/int64_t, /*add_cuStreamWriteValue32=*/bool, /*number_of_pgs=*/int64_t, - /*unfuse_loops=*/bool>; + /*unfuse_loops=*/bool, + /*use_cuda_graph=*/bool>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -72,8 +74,11 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf std::map OverlapBenchmark::times = {}; TEST_P(OverlapBenchmark, DummyBenchmark) { - int64_t number_of_warmups = 50; + constexpr int64_t number_of_warmups = 50; constexpr int64_t number_of_iterations = 100; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; + constexpr int64_t iteration_cuda_graph_capture = 5; const int64_t D = communicator_->size(); @@ -85,7 +90,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { number_of_streams, add_cuStreamWriteValue32, number_of_pgs, - unfuse_loops] = GetParam(); + unfuse_loops, + use_cuda_graph] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -94,6 +100,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { std::vector streams = createStreams(number_of_streams, communicator_->deviceId()); + setCurrentCUDAStream(streams.at(0)); auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); auto ta = at::randn({S, M/S,K}, options); @@ -104,6 +111,8 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { cudaEventCreate(&start); cudaEventCreate(&stop); + at::cuda::CUDAGraph cuda_graph; + CUdeviceptr pDevice; void* ptr; if (add_cuStreamWriteValue32) { @@ -113,45 +122,56 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { - if (iteration == 10) { + if (iteration == iteration_profiler_start) { cudaProfilerStart();; } if (iteration == number_of_warmups) { 
cudaEventRecord(start); } - for (auto j : c10::irange(S)) { - int64_t stream_index = j % streams.size(); - setCurrentCUDAStream(streams.at(stream_index)); - - auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); - - auto ta_j = ta.select(0, j); - auto ta_unsharded_j = ta_unsharded.select(0, j); - - // communication - world->_allgather_base(ta_unsharded_j, ta_j)->wait(); - - if (add_cuStreamWriteValue32) { - cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + if (iteration <= iteration_cuda_graph_capture) { + if (iteration == iteration_cuda_graph_capture) { + cuda_graph.capture_begin(); } - if (unfuse_loops == false) { - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); - } - } - if (unfuse_loops) { for (auto j : c10::irange(S)) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); + + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + + auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); - // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + // communication + world->_allgather_base(ta_unsharded_j, ta_j)->wait(); + + if (add_cuStreamWriteValue32) { + cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } + if (unfuse_loops == false) { + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } + } + if (unfuse_loops) { + for (auto j : c10::irange(S)) { + int64_t stream_index = j % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + auto ta_unsharded_j = ta_unsharded.select(0, j); + + // compute + auto tc_j = torch::matmul(ta_unsharded_j,tb); + } } + if (iteration == iteration_cuda_graph_capture) { + cuda_graph.capture_end(); + } else { + setCurrentCUDAStream(streams.at(0)); + synchronizeStreams(streams); + } + } else { + cuda_graph.replay(); } 
- setCurrentCUDAStream(c10::cuda::getDefaultCUDAStream(communicator_->deviceId())); - synchronizeStreams(streams); - if (iteration == 15) { + if (iteration == iteration_profiler_end) { cudaProfilerStop();; } } @@ -182,7 +202,8 @@ INSTANTIATE_TEST_SUITE_P( /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8), - /*unfuse_loops=*/testing::Values(false, true)), + /*unfuse_loops=*/testing::Values(false, true), + /*use_cuda_graph=*/testing::Values(false)), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -194,7 +215,8 @@ INSTANTIATE_TEST_SUITE_P( << "Streams" << std::get<5>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") << "Pgs" << std::get<7>(info.param) - << ((std::get<8>(info.param))? "_unfused" : ""); + << ((std::get<8>(info.param))? "_unfused" : "") + << ((std::get<9>(info.param))? 
"_WithCudaGraph" : ""); return os.str(); }); From 9dddac2a6320e315f1300febc624a03e084aa54f Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 25 Nov 2024 16:51:59 -0800 Subject: [PATCH 13/55] write matmul to sliced output --- tests/cpp/test_multidevice_overlap.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index c93987890b4..5600041dc7d 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -73,7 +73,7 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf std::map OverlapBenchmark::times = {}; -TEST_P(OverlapBenchmark, DummyBenchmark) { +TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { constexpr int64_t number_of_warmups = 50; constexpr int64_t number_of_iterations = 100; constexpr int64_t iteration_profiler_start = 10; @@ -106,6 +106,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { auto ta = at::randn({S, M/S,K}, options); auto ta_unsharded = at::empty({S, D, M/S,K}, options); auto tb = at::randn({K,N}, options); + auto tc = at::empty({S, D, M/S, N}, options); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -140,6 +141,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); + auto tc_j = ta_unsharded.select(0, j); // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); @@ -149,7 +151,7 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { } if (unfuse_loops == false) { // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j,tb); } } if (unfuse_loops) { @@ -157,9 +159,10 @@ TEST_P(OverlapBenchmark, DummyBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); auto ta_unsharded_j = ta_unsharded.select(0, j); + auto tc_j = ta_unsharded.select(0, j); // compute - auto tc_j = torch::matmul(ta_unsharded_j,tb); 
+ torch::matmul_out(tc_j, ta_unsharded_j,tb); } } if (iteration == iteration_cuda_graph_capture) { From faf8bbe6b9c1ddf19b31069d66387b060481e9bf Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 28 Nov 2024 08:12:18 -0800 Subject: [PATCH 14/55] wip cuStreamWriteValue not working --- bench/test | 14 +- tests/cpp/test_multidevice_overlap.cpp | 219 +++++++++++++++++++++++-- 2 files changed, 216 insertions(+), 17 deletions(-) diff --git a/bench/test b/bench/test index 8a64225d9e9..c27cb9ce74b 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_cudaGraph_NCCL_S1 +EXPERIMENT=profile_NCCL_with_cuStreamValue DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,17 +10,17 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=1 +BACKEND=UCC +S=8 M=32768 K=32768 N=1024 Streams=8 Pgs=1 # UNFUSE="_unfused" -GRAPH="_WithCudaGraph" +# GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ -GTEST_PREFIX="OverlapBenchmark.DummyBenchmark/" +GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -42,7 +42,7 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO @@ -52,7 +52,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end 
stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 5600041dc7d..0d55580a11a 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -20,6 +20,8 @@ #include #include +#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1 + namespace nvfuser { namespace { @@ -44,6 +46,190 @@ void synchronizeStreams(const std::vector& streams) { } // namespace +TEST_F(NVFuserTest, cuStreamWriteValue32) { + constexpr cuuint32_t value = 3; + cudaError_t error; + CUdeviceptr pDevice; + volatile cuuint32_t* ptr; + error = cudaSetDevice(0); + ASSERT_EQ(error, 0); + error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); + ASSERT_EQ(error, 0); + error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); + ASSERT_EQ(error, 0); + + at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool( + /*isHighPriority=*/true, /*device_index*/0); + CUstream stream = c10_stream.stream(); + CUresult st; + st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0); + ASSERT_EQ(st, 0); + + torch::cuda::synchronize(); + cuuint32_t ptr2; + error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost); + ASSERT_EQ(error, 0); + ASSERT_EQ(ptr2, value); + + + int i = 0; + while (i < 10000000) { + if (*ptr == value) { + std::cout << " BREAK " << *ptr < DummyOverlapBenchmark::times = {}; + +TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { + constexpr int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 100; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t 
iteration_profiler_end = 15; + + + auto [backend, + M, + K, + N, + L, + number_of_streams, + add_cuStreamWriteValue32, + number_of_pgs] = GetParam(); + + std::vector all_ranks(communicator_->size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + + std::vector streams = + createStreams(number_of_streams, communicator_->deviceId()); + setCurrentCUDAStream(streams.at(0)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + auto ta = at::randn({M, K}, options); + auto tb = at::randn({K, N}, options); + auto tc = at::empty({M, N}, options); + auto src = at::randn({L}, options); + auto dst = at::empty({L * communicator_->size()}, options); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + CUdeviceptr pDevice; + void* ptr; + if (add_cuStreamWriteValue32) { + cudaMallocHost(&ptr, 32); + cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + } + + for (const auto& iteration : + c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == iteration_profiler_start) { + cudaProfilerStart();; + } + if (iteration == number_of_warmups) { + cudaEventRecord(start); + } + int64_t stream_index = iteration % streams.size(); + setCurrentCUDAStream(streams.at(stream_index)); + + auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(iteration % number_of_pgs)); + + // communication + world->_allgather_base(dst, src)->wait(); + + // compute + torch::matmul_out(tc, ta, tb); + + if (add_cuStreamWriteValue32) { + + cuStreamWriteValue32( +#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS + (CUstream)world->getCudaStream(communicator_->device()).stream(), +#else + (CUstream)streams.at(stream_index).stream(), +#endif + (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + } + + setCurrentCUDAStream(streams.at(0)); + synchronizeStreams(streams); + if (iteration == iteration_profiler_end) { + cudaProfilerStop();; + } + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + 
float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= number_of_iterations; + + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + if (add_cuStreamWriteValue32) { + cudaFree(ptr); + } +} + +INSTANTIATE_TEST_SUITE_P( + , + DummyOverlapBenchmark, + testing::Combine( + testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), + /*M=*/testing::Values(pow(2,10), pow(2,15)), + /*K=*/testing::Values(pow(2,10), pow(2,15)), + /*N=*/testing::Values(pow(2,10)), + /*L=*/testing::Values(pow(2,15)), + /*number_of_streams=*/testing::Values(1, 8), + /*add_cuStreamWriteValue32*/testing::Values(false, true), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + [](const testing::TestParamInfo& info) + -> std::string { + std::ostringstream os; + os << /*backend*/std::get<0>(info.param) << "_" + << "M" << std::get<1>(info.param) << "_" + << "K" << std::get<2>(info.param) << "_" + << "N" << std::get<3>(info.param) << "_" + << "L" << std::get<4>(info.param) << "_" + << "Streams" << std::get<5>(info.param) << "_" + << ((std::get<6>(info.param))? 
"WithcuStreamWriteValue32_" : "") + << "Pgs" << std::get<7>(info.param); + return os.str(); + }); + using OverlapBenchmarkParams = std::tuple< CommunicatorBackend, /*S=*/int64_t, @@ -115,10 +301,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { at::cuda::CUDAGraph cuda_graph; CUdeviceptr pDevice; - void* ptr; + cuuint32_t* ptr; if (add_cuStreamWriteValue32) { - cudaMallocHost(&ptr, 32); - cudaHostGetDevicePointer((void**)&pDevice, ptr, 0); + cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); + cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); } for (const auto& iteration : @@ -129,8 +315,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { if (iteration == number_of_warmups) { cudaEventRecord(start); } - if (iteration <= iteration_cuda_graph_capture) { - if (iteration == iteration_cuda_graph_capture) { + if (!use_cuda_graph || (iteration <= iteration_cuda_graph_capture)) { + if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { cuda_graph.capture_begin(); } for (auto j : c10::irange(S)) { @@ -141,13 +327,22 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); - auto tc_j = ta_unsharded.select(0, j); + auto tc_j = tc.select(0, j); // communication world->_allgather_base(ta_unsharded_j, ta_j)->wait(); if (add_cuStreamWriteValue32) { - cuStreamWriteValue32((CUstream)streams.at(stream_index), (CUdeviceptr)pDevice, (cuuint32_t)1, (unsigned int)0); + if (!communicator_->deviceId()){ + std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl; + } + cuStreamWriteValue32( +#if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS + 
(CUstream)world->getCudaStream(communicator_->device()).stream(), +#else + // (CUstream)streams.at(stream_index).stream(), +#endif + (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); } if (unfuse_loops == false) { // compute @@ -159,13 +354,13 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); auto ta_unsharded_j = ta_unsharded.select(0, j); - auto tc_j = ta_unsharded.select(0, j); + auto tc_j = tc.select(0, j); // compute torch::matmul_out(tc_j, ta_unsharded_j,tb); } } - if (iteration == iteration_cuda_graph_capture) { + if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { cuda_graph.capture_end(); } else { setCurrentCUDAStream(streams.at(0)); @@ -189,7 +384,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; if (add_cuStreamWriteValue32) { - cudaFree(ptr); + std::cout << "RANK " << communicator_->device() << " entering while loop. 
Max index=" << (number_of_warmups + number_of_iterations)*S + S << std::endl; + while (*ptr < (cuuint32_t)(number_of_warmups + number_of_iterations)*S + S - 1) { + std::cout << "RANK " << communicator_->device() << " waiting at index=" << *ptr << std::endl; + } + cudaFree((void*)ptr); } } From a6b5fd75896d26a15fc0e2b6a8a66e9e81e60016 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 05:45:31 -0800 Subject: [PATCH 15/55] dummy benchmark --- bench/test | 30 ++++--- tests/cpp/test_multidevice_overlap.cpp | 110 ++++--------------------- 2 files changed, 35 insertions(+), 105 deletions(-) diff --git a/bench/test b/bench/test index c27cb9ce74b..cff8d8b34bb 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=profile_NCCL_with_cuStreamValue +EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,27 +10,33 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=UCC +BACKEND=NCCL S=8 -M=32768 +M=131072 #32768 K=32768 -N=1024 +N=32768 #1024 +L=32768 Streams=8 Pgs=1 # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ -GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +# GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" +GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" +# GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO -`` + MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" 
+# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" +# MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" @@ -39,10 +45,12 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" +MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO @@ -52,7 +60,7 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -# NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 0d55580a11a..85059b89a31 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -20,7 +20,7 @@ #include #include -#define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 1 +#define 
CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 0 namespace nvfuser { @@ -46,54 +46,12 @@ void synchronizeStreams(const std::vector& streams) { } // namespace -TEST_F(NVFuserTest, cuStreamWriteValue32) { - constexpr cuuint32_t value = 3; - cudaError_t error; - CUdeviceptr pDevice; - volatile cuuint32_t* ptr; - error = cudaSetDevice(0); - ASSERT_EQ(error, 0); - error = cudaMallocHost((void**)&ptr, sizeof(cuuint32_t)); - ASSERT_EQ(error, 0); - error = cudaHostGetDevicePointer((void**)&pDevice, (void*)ptr, 0); - ASSERT_EQ(error, 0); - - at::cuda::CUDAStream c10_stream = at::cuda::getStreamFromPool( - /*isHighPriority=*/true, /*device_index*/0); - CUstream stream = c10_stream.stream(); - CUresult st; - st = cuStreamWriteValue32(stream, pDevice, value, /*flag=*/0); - ASSERT_EQ(st, 0); - - torch::cuda::synchronize(); - cuuint32_t ptr2; - error = cudaMemcpy(&ptr2, (void*)pDevice, sizeof(cuuint32_t), cudaMemcpyDeviceToHost); - ASSERT_EQ(error, 0); - ASSERT_EQ(ptr2, value); - - - int i = 0; - while (i < 10000000) { - if (*ptr == value) { - std::cout << " BREAK " << *ptr <deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - - if (add_cuStreamWriteValue32) { - cudaFree(ptr); - } } INSTANTIATE_TEST_SUITE_P( @@ -209,13 +140,10 @@ INSTANTIATE_TEST_SUITE_P( DummyOverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*M=*/testing::Values(pow(2,10), pow(2,15)), - /*K=*/testing::Values(pow(2,10), pow(2,15)), - /*N=*/testing::Values(pow(2,10)), - /*L=*/testing::Values(pow(2,15)), - /*number_of_streams=*/testing::Values(1, 8), - /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8)), + /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), [](const testing::TestParamInfo& info) -> std::string { 
std::ostringstream os; @@ -223,10 +151,7 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param) << "_" - << "Streams" << std::get<5>(info.param) << "_" - << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") - << "Pgs" << std::get<7>(info.param); + << "L" << std::get<4>(info.param); return os.str(); }); @@ -333,14 +258,11 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { world->_allgather_base(ta_unsharded_j, ta_j)->wait(); if (add_cuStreamWriteValue32) { - if (!communicator_->deviceId()){ - std::cout << "writing to stream " << world->getCudaStream(communicator_->device()).stream() << " the value " << (cuuint32_t)(iteration * S + j) << ", communicator_->device()=" << communicator_->device() << ", world=" << world << ", number_of_pgs=" << number_of_pgs << " with MACRO=" << CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS << std::endl; - } cuStreamWriteValue32( #if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS (CUstream)world->getCudaStream(communicator_->device()).stream(), #else - // (CUstream)streams.at(stream_index).stream(), + (CUstream)streams.at(stream_index).stream(), #endif (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); } From 8d927bf4d7537b2ae2450efd775c039c68ebffbe Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 06:45:27 -0800 Subject: [PATCH 16/55] add pre post comms option --- bench/test | 22 ++++++++++++--------- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/bench/test b/bench/test index cff8d8b34bb..28532970124 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE +EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,21 +10,25 @@ export 
LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL -S=8 +BACKEND=UCC M=131072 #32768 K=32768 N=32768 #1024 -L=32768 + +S=8 Streams=8 Pgs=1 + +L=32768 +# PRE_COMM="_pre_comm" +POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" @@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 
85059b89a31..9898df02ac8 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*L(communication msgsize)=*/int64_t>; + /*L(communication msgsize)=*/int64_t, + /*pre_comm=*/bool, + /*post_comm=*/bool>; class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { M, K, N, - L] = GetParam(); + L, + pre_comm, + post_comm] = GetParam(); std::vector all_ranks(communicator_->size()); std::iota(all_ranks.begin(), all_ranks.end(), 0); @@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { cudaEventRecord(start); } - setCurrentCUDAStream(communication_stream); - world->_allgather_base(dst, src)->wait(); + if (pre_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } // compute setCurrentCUDAStream(compute_stream); torch::matmul_out(tc, ta, tb); + if (post_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } + if (iteration == iteration_profiler_end) { cudaProfilerStop();; } @@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*pre-comm=*/testing::Bool(), + /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param); + << "L" << std::get<4>(info.param) + << 
((std::get<5>(info.param))? "_pre_comm" : "") + << ((std::get<6>(info.param))? "_post_comm" : ""); return os.str(); }); From d9c581c13a9742b3896baf1bd37bc8bcd0acb923 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 2 Dec 2024 06:45:27 -0800 Subject: [PATCH 17/55] add pre post comms option --- bench/test | 20 +++++++++++-------- tests/cpp/test_multidevice_overlap.cpp | 27 ++++++++++++++++++++------ 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/bench/test b/bench/test index cff8d8b34bb..72c22480714 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_NCCL_P2P_NET_CHUNKSIZE_LARGE +EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -11,20 +11,24 @@ echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 BACKEND=NCCL -S=8 M=131072 #32768 K=32768 N=32768 #1024 -L=32768 + +S=8 Streams=8 Pgs=1 + +L=32768 +# PRE_COMM="_pre_comm" +POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}" +GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,8 +36,8 @@ MPIFLAGS=" -np $NP" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x 
UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" @@ -45,10 +49,10 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=debug" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 85059b89a31..9898df02ac8 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -51,7 +51,9 @@ using DummyOverlapBenchmarkParams = std::tuple< /*M=*/int64_t, /*K=*/int64_t, /*N=*/int64_t, - /*L(communication msgsize)=*/int64_t>; + /*L(communication msgsize)=*/int64_t, + /*pre_comm=*/bool, + /*post_comm=*/bool>; class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -81,7 +83,9 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { M, K, N, - L] = GetParam(); + L, + pre_comm, + post_comm] = GetParam(); std::vector all_ranks(communicator_->size()); std::iota(all_ranks.begin(), all_ranks.end(), 0); @@ -112,13 +116,20 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { cudaEventRecord(start); } - setCurrentCUDAStream(communication_stream); - world->_allgather_base(dst, src)->wait(); + if (pre_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } // compute setCurrentCUDAStream(compute_stream); torch::matmul_out(tc, ta, tb); + if (post_comm) { + setCurrentCUDAStream(communication_stream); + world->_allgather_base(dst, src)->wait(); + } + if (iteration == iteration_profiler_end) 
{ cudaProfilerStop();; } @@ -143,7 +154,9 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17))), + /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*pre-comm=*/testing::Bool(), + /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -151,7 +164,9 @@ INSTANTIATE_TEST_SUITE_P( << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" - << "L" << std::get<4>(info.param); + << "L" << std::get<4>(info.param) + << ((std::get<5>(info.param))? "_pre_comm" : "") + << ((std::get<6>(info.param))? "_post_comm" : ""); return os.str(); }); From bfc7fa6ac9e81d6b4a2552733cc9a76dc1c66635 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 6 Dec 2024 17:26:44 +0200 Subject: [PATCH 18/55] cleanup test script --- bench/test | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/bench/test b/bench/test index 72c22480714..969b8da00e2 100755 --- a/bench/test +++ b/bench/test @@ -1,5 +1,5 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_POST_COMM_UCC_TL_UCP_OVER_IB_LARGE +EXPERIMENT=Dummy_profile_UCC_TL_CUDA DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" @@ -10,7 +10,7 @@ export LOG_FILE_INFO="${LOGS}/info.txt" echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO NP=8 -BACKEND=NCCL +BACKEND=UCC M=131072 #32768 K=32768 N=32768 #1024 @@ -20,8 +20,8 @@ Streams=8 Pgs=1 L=32768 -# PRE_COMM="_pre_comm" -POST_COMM="_post_comm" +PRE_COMM="_pre_comm" +# POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ @@ -33,28 +33,37 @@ export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO MPIFLAGS=" 
-np $NP" -MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" + +# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" +# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO +# MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1" + # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" + +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" +# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" + # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_WORKERS=8" # MPIFLAGS+=" -x UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH=0" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=driver" # MPIFLAGS+=" -x UCC_EC_CUDA_STREAM_TASK_MODE=kernel" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -# MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" -# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" -# MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" + + +MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" # MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" -# MPIFLAGS+=" -x NCCL_P2P_NET_CHUNKSIZE=2MB" -# MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO -# MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO From 1a1138cbb5629fd47c9d0c056ac21db68af2f77b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 8 Jan 2025 18:28:45 +0200 Subject: [PATCH 19/55] update --- bench/test | 27 +++++++++-------- csrc/multidevice/utils.cpp | 6 ++-- 
tests/cpp/test_multidevice_overlap.cpp | 42 ++++++++++++++++++-------- 3 files changed, 46 insertions(+), 29 deletions(-) diff --git a/bench/test b/bench/test index 969b8da00e2..1b5d6f41c5a 100755 --- a/bench/test +++ b/bench/test @@ -1,25 +1,19 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_UCC_TL_CUDA +EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" -export LOGS="${LOG_BASE}/${EXPERIMENT}_${DATE}" - -mkdir -p $LOGS -export LOG_FILE_INFO="${LOGS}/info.txt" -echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO - NP=8 BACKEND=UCC M=131072 #32768 -K=32768 +K=131072 N=32768 #1024 S=8 Streams=8 Pgs=1 -L=32768 +L=1048576 #268435456 #67108864 #131072 PRE_COMM="_pre_comm" # POST_COMM="_post_comm" # UNFUSE="_unfused" @@ -41,7 +35,7 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" # MPIFLAGS+=" -x UCC_TL_CUDA_SCRATCH_SIZE=32mb" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_MAX_RINGS=32" # MPIFLAGS+=" -x UCC_TL_CUDA_ALLGATHER_RING_NUM_CHUNKS=32" @@ -53,10 +47,10 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" -# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" +MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" @@ -65,6 +59,13 @@ MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" + + +export LOGS="${LOG_BASE}/${EXPERIMENT}_${BACKEND}_${DATE}" +mkdir -p $LOGS +export 
LOG_FILE_INFO="${LOGS}/info.txt" +echo "Writing to $LOG_FILE_INFO" | tee -a $LOG_FILE_INFO + echo "mpi flags: $MPIFLAGS" | tee -a $LOG_FILE_INFO TEST_CMD="$BUILD_DIRECTORY/test_multidevice --gtest_filter=${GTEST_FILTER}" diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index d2117b222da..5eb4a8a21b9 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -43,11 +43,11 @@ std::unordered_set getShardedIterDomains(TensorView* tv) { // Returns whether a IterDomain in a TensorView is the outermost // allocated IterDomain in the TensorView. bool isOutermostAllocatedId(TensorView* tv, IterDomain* id) { - for (auto i : tv->getLoopDomain()) { - if (i == id) { + for (auto* loop_id : tv->getLoopDomain()) { + if (loop_id == id) { return true; } - if (!i->isDeviceDim() && !i->isReduction() && !i->isBroadcast()) { + if (!loop_id->isDeviceDim() && !loop_id->isReduction() && !loop_id->isBroadcast()) { return false; } } diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 9898df02ac8..a3999b477ba 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -73,10 +73,11 @@ class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamI std::map DummyOverlapBenchmark::times = {}; TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { - constexpr int64_t number_of_warmups = 50; - constexpr int64_t number_of_iterations = 100; - constexpr int64_t iteration_profiler_start = 10; - constexpr int64_t iteration_profiler_end = 15; + constexpr int64_t number_of_warmups = 20; + constexpr int64_t number_of_iterations = 80; + constexpr int64_t total_number_of_iterations = number_of_warmups + number_of_iterations; + constexpr int64_t iteration_profiler_start = 5; + constexpr int64_t iteration_profiler_end = 10; auto [backend, @@ -90,27 +91,36 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { std::vector all_ranks(communicator_->size()); 
std::iota(all_ranks.begin(), all_ranks.end(), 0); auto world = communicator_->getBackendForTeam(all_ranks, backend); + auto nccl_world = communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); std::vector streams = createStreams(2, communicator_->deviceId()); auto& compute_stream = streams.at(0); auto& communication_stream = streams.at(1); - auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); - auto ta = at::randn({M, K}, options); - auto tb = at::randn({K, N}, options); - auto tc = at::empty({M, N}, options); - auto src = at::randn({L}, options); - auto dst = at::empty({L * communicator_->size()}, options); + auto options_matmul = at::TensorOptions().dtype(torch::kFloat16).device(communicator_->device()); + auto ta = at::randn({M, K}, options_matmul); + auto tb = at::randn({K, N}, options_matmul); + auto tc = at::empty({M, N}, options_matmul); + + auto options_comms = at::TensorOptions().dtype(torch::kFloat32).device(communicator_->device()); + auto src = at::randn({L}, options_comms); + auto dst = at::empty({L * communicator_->size()}, options_comms); + std::vector barrier_scratch_buffer = {at::randn({1}, options_comms)}; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + nccl_world->allreduce(barrier_scratch_buffer)->wait(); + for (const auto& iteration : - c10::irange(number_of_warmups + number_of_iterations)) { + c10::irange(total_number_of_iterations)) { + if (iteration % 10 == 0 && communicator_->deviceId() == 0) { + std::cout << "iteration " << iteration <<"/" << total_number_of_iterations << std::endl; + } if (iteration == iteration_profiler_start) { - cudaProfilerStart();; + cudaProfilerStart(); } if (iteration == number_of_warmups) { cudaEventRecord(start); @@ -133,8 +143,14 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { if (iteration == iteration_profiler_end) { cudaProfilerStop();; } + if (!pre_comm & !post_comm) { + 
nccl_world->allreduce(barrier_scratch_buffer)->wait(); + } synchronizeStreams(streams); } + if (pre_comm || post_comm) { + nccl_world->allreduce(barrier_scratch_buffer)->wait(); + } cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; @@ -154,7 +170,7 @@ INSTANTIATE_TEST_SUITE_P( /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), + /*L=*/testing::Values(1, pow(2,10), pow(2,15), pow(2,17), pow(2,20), pow(2,24), pow(2,26), pow(2,28)), /*pre-comm=*/testing::Bool(), /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) From e037ee5b62418632055fe5f32f8659b0b4bc49d9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 16 Jan 2025 02:38:52 -0800 Subject: [PATCH 20/55] test with stream parallel type and host IR --- bench/test | 34 ++++---- tests/cpp/test_multidevice_overlap.cpp | 111 ++++++++++++++++++++++++- 2 files changed, 127 insertions(+), 18 deletions(-) diff --git a/bench/test b/bench/test index 1b5d6f41c5a..6777835f7b4 100755 --- a/bench/test +++ b/bench/test @@ -1,28 +1,32 @@ #!/bin/bash -EXPERIMENT=Dummy_profile_msgsize256m_float16_M128k_K128k_N32k_UCC_IB +EXPERIMENT=StreamParallelType_tests DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" NP=8 BACKEND=UCC -M=131072 #32768 -K=131072 -N=32768 #1024 +M=32768 +K=32768 +N=1024 S=8 -Streams=8 +Streams=3 Pgs=1 -L=1048576 #268435456 #67108864 #131072 -PRE_COMM="_pre_comm" +# M=131072 #32768 +# K=131072 +# N=32768 #1024 +# L=1048576 #268435456 #67108864 #131072 +# PRE_COMM="_pre_comm" # POST_COMM="_post_comm" # UNFUSE="_unfused" # GRAPH="_WithCudaGraph" # cuStreamWrite=WithcuStreamWriteValue32_ # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" -GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" -# 
GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" -GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" +# GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" +GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmarkStreamParallelType/" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +# GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO @@ -32,7 +36,7 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x NCCL_DEBUG=TRACE" #INFO # MPIFLAGS+=" -x NCCL_MAX_NCHANNELS=1" -# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" +MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" # MPIFLAGS+=" -x UCC_TL_NCCL_SYNC=event" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=cuda" @@ -47,15 +51,15 @@ MPIFLAGS=" -np $NP" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH=1M" # MPIFLAGS+=" -x UCC_EC_CUDA_EXEC_NUM_THREADS=512" -MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" -MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" +# MPIFLAGS+=" -x UCC_CL_BASIC_TLS=ucp" +# MPIFLAGS+=" -x UCX_RNDV_THRESH=0 -x UCX_TLS=ib,cuda_copy" # MPIFLAGS+=" -x UCX_RNDV_SCHEME=put_zcopy" -MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" +# MPIFLAGS+=" -x UCX_RNDV_SCHEME=get_zcopy" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -# MPIFLAGS+=" -x UCC_COLL_TRACE=info" +MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 272d785e2a1..7cf3cd288a4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -345,15 +345,120 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { } } 
+TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { + constexpr int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 200; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; + + const int64_t D = communicator_->size(); + auto [backend, + S, + M, + K, + N, + number_of_streams, + add_cuStreamWriteValue32, + number_of_pgs, + unfuse_loops, + use_cuda_graph] = GetParam(); + + if (M % (D * S) != 0) { + GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M + << ", D = " << D << ", S = " << S; + } + if (add_cuStreamWriteValue32) { + GTEST_SKIP() << "cuStreamWriteValue32 not supported with StreamParallelType"; + } + if (number_of_pgs > 1) { + GTEST_SKIP() << "StreamParallelType not supported with multiple process groups"; + } + if (unfuse_loops) { + GTEST_SKIP() << "StreamParallelType not supported with unfused loops"; + } + if (use_cuda_graph) { + GTEST_SKIP() << "StreamParallelType not supported with cuda graphs"; + } + + + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K] + TensorView* b = makeContigTensor(2); //[K, N] + TensorView* c = matmul(a, b); //[S, D, M/(S*D), N] + + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + + auto mesh = DeviceMesh::createForNumDevices(D); + a->setDeviceMesh(mesh); + b->setDeviceMesh(mesh); + c->setDeviceMesh(mesh); + + a->axis(1)->parallelize(ParallelType::DIDx); + c->axis(0)->parallelize(ParallelType::Stream); + + communicator_->setDefaultBackend(backend); + + hir::HostIrEvaluatorParams params; + params.number_of_streams = number_of_streams; + MultiDeviceExecutor executor(std::move(fusion), *communicator_, params); + + + auto tensor_options = + at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options); + at::Tensor ta = ta_unsharded.slice( + 
1, communicator_->deviceId(), communicator_->deviceId() + 1); + at::Tensor tb = at::randn({K, N}, tensor_options); + at::Tensor tc_ref = at::matmul(ta_unsharded, tb); + + std::vector inputs = {ta, tb}; + at::Tensor tc; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + for (const auto& iteration : + c10::irange(number_of_warmups + number_of_iterations)) { + if (iteration == iteration_profiler_start) { + cudaProfilerStart();; + } + if (iteration == number_of_warmups) { + cudaEventRecord(start); + } + + tc = executor.runWithInput(inputs).at(0); + + if (iteration == iteration_profiler_end) { + cudaProfilerStop();; + } + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= number_of_iterations; + + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)); +} + INSTANTIATE_TEST_SUITE_P( , OverlapBenchmark, testing::Combine( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), /*S=*/testing::Values(1,2,4,8, 16, 32), - /*M=*/testing::Values(pow(2,10), pow(2,15)), - /*K=*/testing::Values(pow(2,10), pow(2,15)), - /*N=*/testing::Values(pow(2,10)), + /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)), + /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)), + /*N=*/testing::Values(pow(2,10), pow(2,15)), /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8), From 8328c2809420bab045248b34c321b74279198a17 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 20 Jan 2025 05:49:51 -0800 Subject: [PATCH 21/55] add support for other dtypes --- bench/test | 4 +++- 
tests/cpp/test_multidevice_overlap.cpp | 21 +++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/bench/test b/bench/test index 6777835f7b4..d1836ac8ccb 100755 --- a/bench/test +++ b/bench/test @@ -9,6 +9,8 @@ M=32768 K=32768 N=1024 +DTYPE="__half" # float, __bfloat + S=8 Streams=3 Pgs=1 @@ -25,7 +27,7 @@ Pgs=1 # GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmark/" # GTEST_PREFIX="DummyOverlapBenchmark.PipelinedAGMatmulBenchmark/" GTEST_PREFIX="OverlapBenchmark.PipelinedAGMatmulBenchmarkStreamParallelType/" -GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" +GTEST_POSTFIX="${BACKEND}_S${S}_M${M}_K${K}_N${N}_Streams${Streams}_${DTYPE}_${cuStreamWrite}Pgs${Pgs}${UNFUSE}${GRAPH}" # GTEST_POSTFIX="${BACKEND}_M${M}_K${K}_N${N}_L${L}${PRE_COMM}${POST_COMM}" export GTEST_FILTER="${GTEST_PREFIX}${GTEST_POSTFIX}" echo "gtest filter: $GTEST_FILTER" | tee -a $LOG_FILE_INFO diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 7cf3cd288a4..c08eea14b93 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -196,7 +196,8 @@ using OverlapBenchmarkParams = std::tuple< /*add_cuStreamWriteValue32=*/bool, /*number_of_pgs=*/int64_t, /*unfuse_loops=*/bool, - /*use_cuda_graph=*/bool>; + /*use_cuda_graph=*/bool, + DataType>; class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { protected: @@ -233,7 +234,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { add_cuStreamWriteValue32, number_of_pgs, unfuse_loops, - use_cuda_graph] = GetParam(); + use_cuda_graph, + dtype] = GetParam(); GTEST_ASSERT_EQ(M % S, 0); @@ -244,7 +246,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { createStreams(number_of_streams, communicator_->deviceId()); setCurrentCUDAStream(streams.at(0)); - auto options = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); 
+ auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device()); auto ta = at::randn({S, M/S,K}, options); auto ta_unsharded = at::empty({S, D, M/S,K}, options); auto tb = at::randn({K,N}, options); @@ -361,7 +363,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { add_cuStreamWriteValue32, number_of_pgs, unfuse_loops, - use_cuda_graph] = GetParam(); + use_cuda_graph, + dtype] = GetParam(); if (M % (D * S) != 0) { GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M @@ -384,8 +387,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); - TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K] - TensorView* b = makeContigTensor(2); //[K, N] + TensorView* a = makeContigTensor(4, dtype); //[S, DIDx(D), M/(S*D), K] + TensorView* b = makeContigTensor(2, dtype); //[K, N] TensorView* c = matmul(a, b); //[S, D, M/(S*D), N] fusion->addInput(a); @@ -408,7 +411,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { auto tensor_options = - at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device()); at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options); at::Tensor ta = ta_unsharded.slice( 1, communicator_->deviceId(), communicator_->deviceId() + 1); @@ -463,7 +466,8 @@ INSTANTIATE_TEST_SUITE_P( /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8), /*unfuse_loops=*/testing::Values(false, true), - /*use_cuda_graph=*/testing::Values(false)), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws + /*use_cuda_graph=*/testing::Values(false), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws + 
testing::Values(DataType::Float, DataType::Half, DataType::BFloat16)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; @@ -473,6 +477,7 @@ INSTANTIATE_TEST_SUITE_P( << "K" << std::get<3>(info.param) << "_" << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" + << /*dtype:*/std::get<10>(info.param) << "_" << ((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") << "Pgs" << std::get<7>(info.param) << ((std::get<8>(info.param))? "_unfused" : "") From 2fecf02c58822a1fc2da9fdcfcc50ff4ab8204ad Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 22 Jan 2025 17:05:06 +0200 Subject: [PATCH 22/55] remove trace print --- bench/test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/test b/bench/test index d1836ac8ccb..8527e2d370c 100755 --- a/bench/test +++ b/bench/test @@ -61,7 +61,7 @@ MPIFLAGS+=" -x UCC_CL_BASIC_TLS=nccl" MPIFLAGS+=" -x UCX_NET_DEVICES=mlx5_0:1" # MPIFLAGS+=" -x UCC_CL_BASIC_TLS=^sharp,mlx5" -MPIFLAGS+=" -x UCC_COLL_TRACE=info" +# MPIFLAGS+=" -x UCC_COLL_TRACE=info" # MPIFLAGS+=" -x UCC_LOG_LEVEL=debug" # MPIFLAGS+=" -x TORCH_NCCL_AVOID_RECORD_STREAMS=1" # MPIFLAGS+=" -x CUDA_DEVICE_MAX_CONNECTIONS=2" From 26f1f7a7c9e4dbeef4006110e5601ad6b3966219 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 23 Jan 2025 11:21:29 -0800 Subject: [PATCH 23/55] add stub files --- CMakeLists.txt | 13 ++++++++++++- tests/cpp/multidevice.h | 2 ++ tests/cpp/multidevice_kernels.cu | 23 +++++++++++++++++++++++ tests/cpp/multidevice_kernels.h | 16 ++++++++++++++++ tests/cpp/test_multidevice_gpu_comms.cpp | 23 +++++++++++++++++++++++ 5 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 tests/cpp/multidevice_kernels.cu create mode 100644 tests/cpp/multidevice_kernels.h create mode 100644 tests/cpp/test_multidevice_gpu_comms.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e5322527cf..c1899c416e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -611,6 +611,16 @@ 
if(BUILD_TEST) target_include_directories(${RNG_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}") endif() +if(BUILD_TEST) + set(MULTIDEVICE_TEST_KERNELS "${NVFUSER_TESTS}_multidevice_kernels") + add_library(${MULTIDEVICE_TEST_KERNELS} SHARED ${NVFUSER_ROOT}/tests/cpp/multidevice_kernels.cu) + + # CUDA 11 does not support C++20, so hard code C++17 here + set_property(TARGET ${MULTIDEVICE_TEST_KERNELS} PROPERTY CXX_STANDARD 17) + target_link_libraries(${MULTIDEVICE_TEST_KERNELS} PRIVATE torch ${TORCH_LIBRARIES} codegen_internal) + target_include_directories(${MULTIDEVICE_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}") +endif() + function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) list(APPEND TEST_SRC ${NVFUSER_ROOT}/tests/cpp/utils.cpp @@ -669,8 +679,9 @@ if(BUILD_TEST) ${NVFUSER_ROOT}/tests/cpp/test_multidevice_pipeline.cpp ${NVFUSER_ROOT}/tests/cpp/test_multidevice_sharding.cpp ${NVFUSER_ROOT}/tests/cpp/test_multidevice_transformer.cpp + ${NVFUSER_ROOT}/tests/cpp/test_multidevice_gpu_comms.cpp ) - add_test_without_main(test_multidevice "${MULTIDEVICE_TEST_SRCS}" "") + add_test_without_main(test_multidevice "${MULTIDEVICE_TEST_SRCS}" ${MULTIDEVICE_TEST_KERNELS}) list(APPEND TEST_BINARIES test_multidevice) set(MULTIDEVICE_TUTORIAL_SRCS) diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h index 1831eb46bbb..9863c4e919e 100644 --- a/tests/cpp/multidevice.h +++ b/tests/cpp/multidevice.h @@ -48,4 +48,6 @@ class MultiDeviceTest : public NVFuserTest { void waitForDebuggerAtRank(DeviceIdxType rank); }; +__global__ void DummyMultiDeviceKernel(); + } // namespace nvfuser diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu new file mode 100644 index 00000000000..6553bee9393 --- /dev/null +++ b/tests/cpp/multidevice_kernels.cu @@ -0,0 +1,23 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +// Warning: this file should not include any header from nvFuser or pytorch +// (except raw headers). Compiling dynamic_type.h with nvcc is not supported. +// Compiling pytorch with nvcc is not supported either. + +#include + +namespace nvfuser { + +__global__ void DummyMultiDeviceKernel() {} + +void LaunchDummyMultiDeviceKernel() { + DummyMultiDeviceKernel<<<1, 1>>>(); +} + +} // namespace nvfuser diff --git a/tests/cpp/multidevice_kernels.h b/tests/cpp/multidevice_kernels.h new file mode 100644 index 00000000000..0f1099aa8c3 --- /dev/null +++ b/tests/cpp/multidevice_kernels.h @@ -0,0 +1,16 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +#pragma once + + +namespace nvfuser { + +void LaunchDummyMultiDeviceKernel(); + +} // namespace nvfuser diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp new file mode 100644 index 00000000000..bfffdba70f0 --- /dev/null +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -0,0 +1,23 @@ +// clang-format off +/* +* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. +* All rights reserved. 
+* SPDX-License-Identifier: BSD-3-Clause +*/ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +TEST_F(MultiDeviceTest, DummyMultiDeviceKernelTest) { + LaunchDummyMultiDeviceKernel(); +} + +} // namespace nvfuser From 03b0147012452f393582a4cfbd30c6f10644519f Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 24 Jan 2025 06:08:05 -0800 Subject: [PATCH 24/55] first working example opening cuda ipc handles --- csrc/multidevice/communicator.h | 4 ++ tests/cpp/test_multidevice_gpu_comms.cpp | 54 +++++++++++++++++++++++- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index f476de2b37a..9ee6c613da8 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -141,6 +141,10 @@ class Communicator { return false; } + auto getTcpStore() { + return store_; + } + private: Communicator( CommunicatorBackend backend = comm_backend_default, diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index bfffdba70f0..8f662130257 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -16,8 +16,58 @@ namespace nvfuser { -TEST_F(MultiDeviceTest, DummyMultiDeviceKernelTest) { - LaunchDummyMultiDeviceKernel(); +namespace { + +#define CUDA_CALL(call) ASSERT_EQ((call), cudaSuccess) + +template +std::vector toBytes(T data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +T fromBytes(std::vector bytes) { + return *reinterpret_cast(bytes.data()); +} + +} // namespace + +class GpuCommTest : public MultiDeviceTest {}; + +TEST_F(GpuCommTest, IpcMemHandle) { + // Allocate GPU memory + constexpr size_t size = sizeof(int64_t); + const int64_t num_devices = communicator_->size(); + const int64_t rank = communicator_->deviceId(); + void* d_ptr; + CUDA_CALL(cudaMalloc(&d_ptr, 
size)); + + // Write the value 3 to the cuda buffer + const int64_t value = rank; + CUDA_CALL(cudaMemcpy(d_ptr, &value, sizeof(int64_t), cudaMemcpyHostToDevice)); + + cudaIpcMemHandle_t ipc_handle; + CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr)); + + auto store = communicator_->getTcpStore(); + store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle)); + communicator_->barrier(); + auto peer_ipc_handle = fromBytes(store->get("ipc_handle_" + std::to_string((rank + 1) % num_devices))); + + void* peer_d_ptr; + CUDA_CALL(cudaIpcOpenMemHandle(&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + + int64_t peer_value; + CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size, cudaMemcpyDeviceToHost)); + + EXPECT_EQ((value + 1) % num_devices, peer_value); + + // Clean up + CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); + CUDA_CALL(cudaFree(d_ptr)); + } } // namespace nvfuser From 7625fab82011eba82f19d3b38decb2f7ec6a6f59 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 24 Jan 2025 07:21:14 -0800 Subject: [PATCH 25/55] adding a non-working example with cudaDeviceCanAccessPeer --- tests/cpp/test_multidevice_gpu_comms.cpp | 47 +++++++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 8f662130257..13c05f228fd 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -44,9 +44,8 @@ TEST_F(GpuCommTest, IpcMemHandle) { void* d_ptr; CUDA_CALL(cudaMalloc(&d_ptr, size)); - // Write the value 3 to the cuda buffer const int64_t value = rank; - CUDA_CALL(cudaMemcpy(d_ptr, &value, sizeof(int64_t), cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpy(d_ptr, &value, size, cudaMemcpyHostToDevice)); cudaIpcMemHandle_t ipc_handle; CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr)); @@ -70,4 +69,48 @@ TEST_F(GpuCommTest, IpcMemHandle) { } +TEST_F(GpuCommTest, DeviceEnablePeerAccess) { + // Doesn't seem to 
work when the PID are differents, i.e., when it's one CPU rank per GPU. The line "udaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice)" throws. + // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L324-328 + // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L249 + GTEST_SKIP(); + + // Allocate GPU memory + constexpr size_t size = sizeof(int64_t); + const int64_t num_devices = communicator_->size(); + const int64_t rank = communicator_->deviceId(); + const int64_t peer = (rank + 1) % num_devices; + // const int64_t accessing_peer = (num_devices + rank - 1) % num_devices; + + int can_access_peer; + CUDA_CALL(cudaDeviceCanAccessPeer (&can_access_peer, rank, peer)); + if (!can_access_peer) { + GTEST_SKIP() << "Peer access not enabled between devices " << rank << " and " << peer; + } + + CUDA_CALL(cudaDeviceEnablePeerAccess(peer, /*flag (reserved)*/0)); + + void* d_ptr; + CUDA_CALL(cudaMalloc(&d_ptr, size)); + + const int64_t value = rank; + CUDA_CALL(cudaMemcpy(d_ptr, &value, size, cudaMemcpyHostToDevice)); + + + auto store = communicator_->getTcpStore(); + store->set("d_ptr_" + std::to_string(rank), toBytes(d_ptr)); + communicator_->barrier(); + auto peer_d_ptr = fromBytes(store->get("d_ptr_" + std::to_string(peer))); + + CUDA_CALL(cudaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice)); + int64_t peer_value; + CUDA_CALL(cudaMemcpy(&peer_value, d_ptr, size, cudaMemcpyDeviceToHost)); + + EXPECT_EQ((value + 1) % num_devices, peer_value); + + // Clean up + CUDA_CALL(cudaDeviceDisablePeerAccess(peer)); // not necessary + CUDA_CALL(cudaFree(d_ptr)); +} + } // namespace nvfuser From f703abda1196179a2140eab84fbc06a0b60fe6f3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 28 Jan 2025 09:36:16 -0800 Subject: [PATCH 26/55] cleanup --- tests/cpp/test_multidevice_gpu_comms.cpp | 44 ------------------------ 1 file changed, 44 deletions(-) diff --git 
a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 13c05f228fd..a9973d656f4 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -69,48 +69,4 @@ TEST_F(GpuCommTest, IpcMemHandle) { } -TEST_F(GpuCommTest, DeviceEnablePeerAccess) { - // Doesn't seem to work when the PID are differents, i.e., when it's one CPU rank per GPU. The line "udaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice)" throws. - // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L324-328 - // https://github.com/NVIDIA/nccl/blob/1672c85781ba6158d5d173d3ecac969f8796af11/src/transport/p2p.cc#L249 - GTEST_SKIP(); - - // Allocate GPU memory - constexpr size_t size = sizeof(int64_t); - const int64_t num_devices = communicator_->size(); - const int64_t rank = communicator_->deviceId(); - const int64_t peer = (rank + 1) % num_devices; - // const int64_t accessing_peer = (num_devices + rank - 1) % num_devices; - - int can_access_peer; - CUDA_CALL(cudaDeviceCanAccessPeer (&can_access_peer, rank, peer)); - if (!can_access_peer) { - GTEST_SKIP() << "Peer access not enabled between devices " << rank << " and " << peer; - } - - CUDA_CALL(cudaDeviceEnablePeerAccess(peer, /*flag (reserved)*/0)); - - void* d_ptr; - CUDA_CALL(cudaMalloc(&d_ptr, size)); - - const int64_t value = rank; - CUDA_CALL(cudaMemcpy(d_ptr, &value, size, cudaMemcpyHostToDevice)); - - - auto store = communicator_->getTcpStore(); - store->set("d_ptr_" + std::to_string(rank), toBytes(d_ptr)); - communicator_->barrier(); - auto peer_d_ptr = fromBytes(store->get("d_ptr_" + std::to_string(peer))); - - CUDA_CALL(cudaMemcpy(d_ptr, peer_d_ptr, size, cudaMemcpyDeviceToDevice)); - int64_t peer_value; - CUDA_CALL(cudaMemcpy(&peer_value, d_ptr, size, cudaMemcpyDeviceToHost)); - - EXPECT_EQ((value + 1) % num_devices, peer_value); - - // Clean up - CUDA_CALL(cudaDeviceDisablePeerAccess(peer)); // not necessary - 
CUDA_CALL(cudaFree(d_ptr)); -} - } // namespace nvfuser From abf5c17d27f883173f81c8480ac1f4b167f87f17 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 28 Jan 2025 11:20:23 -0800 Subject: [PATCH 27/55] AllgatherThroughCudaMemcpyAsync --- tests/cpp/multidevice_kernels.cu | 39 ++++++++++++++++++++++++ tests/cpp/multidevice_kernels.h | 34 ++++++++++++++++++++- tests/cpp/test_multidevice_gpu_comms.cpp | 37 ++++++++++++---------- 3 files changed, 93 insertions(+), 17 deletions(-) diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu index 6553bee9393..7cbd4753eb3 100644 --- a/tests/cpp/multidevice_kernels.cu +++ b/tests/cpp/multidevice_kernels.cu @@ -11,13 +11,52 @@ // Compiling pytorch with nvcc is not supported either. #include +#include namespace nvfuser { +#define CUDA_CALL(call) NVF_ERROR((call) == cudaSuccess, "CUDA call failed: ", cudaGetErrorString(cudaGetLastError())) + __global__ void DummyMultiDeviceKernel() {} void LaunchDummyMultiDeviceKernel() { DummyMultiDeviceKernel<<<1, 1>>>(); } +int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0; + +AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { + cudaIpcMemHandle_t input_ipc_handle; + CUDA_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr())); + + auto store = communicator->getTcpStore(); + const int64_t my_rank = communicator->deviceId(); + store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle)); + + communicator_->barrier(); + + sizes_.resize(communicator_->size(), 0); + input_ptrs_.resize(communicator_->size(), nullptr); + output_ptrs_.resize(communicator_->size(), nullptr); + for (int64_t rank: c10::irange(communicator_->size())) { + auto output = outputs.at(rank); + sizes_.at(rank) = output.numel() * output.element_size(); + + output_ptrs_.at(rank) = output.data_ptr(); + if (rank == my_rank) { + 
input_ptrs_.at(rank) = input.data_ptr(); + } else { + auto peer_ipc_handle = fromBytes(store->get(prefix() + std::to_string(rank))); + CUDA_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + } + } +} + +void AllgatherThroughCudaMemcpyAsync::post() const { + for (size_t i = 0; i < sizes_.size(); i++) { + CUDA_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); + } +} + + } // namespace nvfuser diff --git a/tests/cpp/multidevice_kernels.h b/tests/cpp/multidevice_kernels.h index 0f1099aa8c3..40e29bd7989 100644 --- a/tests/cpp/multidevice_kernels.h +++ b/tests/cpp/multidevice_kernels.h @@ -7,10 +7,42 @@ // clang-format on #pragma once - +#include namespace nvfuser { +template +std::vector toBytes(T data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +T fromBytes(std::vector bytes) { + return *reinterpret_cast(bytes.data()); +} + void LaunchDummyMultiDeviceKernel(); +class AllgatherThroughCudaMemcpyAsync { + public: + AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator); + + void post() const; + + private: + std::string prefix() const { + return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id); + } + + static int64_t running_counter; + int64_t unique_id; + Communicator* communicator_; + std::vector sizes_; + std::vector input_ptrs_; + std::vector output_ptrs_; +}; + + } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index a9973d656f4..2afe784056d 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -16,24 +16,8 @@ namespace nvfuser { -namespace { - #define CUDA_CALL(call) ASSERT_EQ((call), cudaSuccess) -template -std::vector toBytes(T data) { - return std::vector( - reinterpret_cast(&data), - reinterpret_cast(&data) + sizeof(T)); -} - 
-template -T fromBytes(std::vector bytes) { - return *reinterpret_cast(bytes.data()); -} - -} // namespace - class GpuCommTest : public MultiDeviceTest {}; TEST_F(GpuCommTest, IpcMemHandle) { @@ -69,4 +53,25 @@ TEST_F(GpuCommTest, IpcMemHandle) { } +TEST_F(GpuCommTest, Allgather) { + constexpr int64_t kTensorSize = 1024; + + at::Tensor input = at::full({kTensorSize}, communicator_->deviceId(), tensor_options); + auto outputs = std::vector(communicator_->size()); + std::generate(outputs.begin(), outputs.end(), [&]() { + return at::empty({kTensorSize}, tensor_options); + }); + + AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_); + allgather.post(); + + torch::cuda::synchronize(); + communicator_->barrier(); + + for (int64_t i = 0; i < communicator_->size(); ++i) { + at::Tensor expected = at::full({kTensorSize}, i, tensor_options); + EXPECT_TRUE(outputs[i].equal(expected)); + } +} + } // namespace nvfuser From 836d59955592615c0ee46b0454df9e3c620e43cb Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 29 Jan 2025 15:44:03 +0200 Subject: [PATCH 28/55] refactor to expose choice of backend --- csrc/host_ir/executor.cpp | 81 ++++++++++++++++++++---- csrc/host_ir/executor.h | 33 ++++++++++ csrc/host_ir/lower.cpp | 4 ++ csrc/host_ir/lower.h | 12 +++- csrc/multidevice/communication.cpp | 4 +- csrc/multidevice/communication.h | 7 +- csrc/multidevice/communicator.cpp | 3 + csrc/multidevice/communicator.h | 3 - csrc/multidevice/executor.cpp | 7 +- csrc/multidevice/executor.h | 8 ++- csrc/multidevice/multidevice.h | 3 + tests/cpp/multidevice_kernels.cu | 36 ----------- tests/cpp/multidevice_kernels.h | 32 ---------- tests/cpp/test_multidevice_gpu_comms.cpp | 2 +- tests/cpp/test_multidevice_pipeline.cpp | 4 +- 15 files changed, 147 insertions(+), 92 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index eba71fd6ee9..8806a8a5b46 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -69,7 +69,8 @@ void 
HostIrExecutor::compile(Fusion* fusion) { } else { std::vector exprs = fusion->exprs(); for (Expr* e : exprs) { - std::vector communications = HostIrLower::lower(cloner.clone(e)); + HostIrLower lower; + std::vector communications = lower.lower(cloner.clone(e)); for (auto* communication : communications) { host_ir_container_->pushBackTopLevelExprs(communication); } @@ -408,6 +409,45 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } } +int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0; + +AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { + cudaIpcMemHandle_t input_ipc_handle; + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr())); + + auto store = communicator->getTcpStore(); + const int64_t my_rank = communicator->deviceId(); + store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle)); + + communicator_->barrier(); + + sizes_.resize(communicator_->size(), 0); + input_ptrs_.resize(communicator_->size(), nullptr); + output_ptrs_.resize(communicator_->size(), nullptr); + for (int64_t rank: c10::irange(communicator_->size())) { + auto output = outputs.at(rank); + sizes_.at(rank) = output.numel() * output.element_size(); + + output_ptrs_.at(rank) = output.data_ptr(); + if (rank == my_rank) { + input_ptrs_.at(rank) = input.data_ptr(); + } else { + auto peer_ipc_handle = fromBytes(store->get(prefix() + std::to_string(rank))); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + } + } +} + +void AllgatherThroughCudaMemcpyAsync::post() const { + for (size_t i = 0; i < sizes_.size(); i++) { + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); + } +} + + + + + void HostIrEvaluator::handle(Communication* communication) { 
NVF_ERROR( communicator_ != nullptr && communicator_->is_available(), @@ -418,14 +458,30 @@ void HostIrEvaluator::handle(Communication* communication) { at::Tensor output_tensor = getKnownTensorOrUndefined(communication->output(0), expr_evaluator_); - c10d::Backend* backend = - communicator_->getBackendForTeam(communication->team(), std::nullopt); - works_[communication] = postSingleCommunication( - communication, - communicator_->deviceId(), - backend, - input_tensor, - output_tensor); + CommunicatorBackend backend_type = communication->backend(); + + if (backend_type != CommunicatorBackend::kCuda) { + c10d::Backend* backend = + communicator_->getBackendForTeam(communication->team(), backend_type); + works_[communication] = postSingleCommunication( + communication, + communicator_->deviceId(), + backend, + input_tensor, + output_tensor); + return; + } + + NVF_ERROR(communication->type() == CommunicationType::Allgather); + if (allgather_backends_.find(communication) == allgather_backends_.end()) { + allgather_backends_.try_emplace( + communication, + AllgatherThroughCudaMemcpyAsync( + input_tensor, + getKnownTensorOrUndefined(communication->outputs(), expr_evaluator_), + communicator_)); + } + allgather_backends_.at(communication).post(); } void HostIrEvaluator::handle(P2PCommunication* communication) { @@ -446,8 +502,11 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { void HostIrEvaluator::handle(Wait* wait) { Expr* communication = wait->communication(); - NVF_ERROR(works_.find(communication) != works_.end(), "no wait req"); - auto& work = works_.at(communication); + auto it = works_.find(communication); + if (it == works_.end()) { + return; + } + auto& work = it->second; if (work != nullptr) { work->wait(); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index ad3e8422ca1..435a568bc63 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -48,8 +48,40 @@ class HostIrExecutor : public ExecutorAbstract { 
Communicator* communicator_; }; +template +std::vector toBytes(T data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +T fromBytes(std::vector bytes) { + return *reinterpret_cast(bytes.data()); +} + namespace hir { + +class AllgatherThroughCudaMemcpyAsync { + public: + AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator); + + void post() const; + + private: + std::string prefix() const { + return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id); + } + + static int64_t running_counter; + int64_t unique_id; + Communicator* communicator_; + std::vector sizes_; + std::vector input_ptrs_; + std::vector output_ptrs_; +}; + /* a HostIrEvaluator evaluates a host programs represented through a HostIrContainer It is instantiated with the desired HostIrContainer, and runs @@ -145,6 +177,7 @@ class HostIrEvaluator final : public OptOutDispatch { std::unordered_map streams_; std::unordered_map> works_; const int64_t my_device_index_; + std::unordered_map allgather_backends_; }; } // namespace hir diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index ea52ba5eeb6..1631e773ea0 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -304,6 +304,10 @@ std::vector HostIrLower::lower(Expr* c) { lowerToBroadcastOrSendRecv(input_tv, output_tv, comms); } } + + std::for_each(comms.begin(), comms.end(), [this](Expr* comm) { + comm->as()->backend() = params_.communicator_backend; + }); return comms; } diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 02d120cb734..47417e9eba4 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -14,22 +14,30 @@ namespace nvfuser { +struct HostIrLowerParams { + CommunicatorBackend communicator_backend = CommunicatorBackend::kNccl; +}; + class HostIrLower { public: + + HostIrLower(HostIrLowerParams params = HostIrLowerParams()) : params_(params) {} + // The flag `ignore_inner_resharding` is 
useful because the preseg passes // `InsertReshardingsPass` and `ReorderShardedAxisPass` want different // behaviors static bool canLower(Expr* expr, bool ignore_inner_resharding = false); // Lower a sharded Expr into a series of Communication. - static std::vector lower(Expr* c); + std::vector lower(Expr* c); - static std::unique_ptr lower( + std::unique_ptr lower( std::unique_ptr fusion, int64_t my_device_index); private: static std::vector lowerToCollectiveBasedPipelinedGemmComm(Expr* expr); + HostIrLowerParams params_; }; } // namespace nvfuser diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index edcc40e4d5f..c49a5f3a85d 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -145,7 +145,8 @@ Communication::Communication( Team team, DeviceIdxType root, RedOpType red_op, - int64_t scattered_axis) + int64_t scattered_axis, + CommunicatorBackend backend) : Expr(passkey) { NVF_ERROR( in->getDeviceMesh().size() > 0, @@ -161,6 +162,7 @@ Communication::Communication( addDataAttribute(root); addDataAttribute(red_op); addDataAttribute(scattered_axis); + addDataAttribute(backend); validate(); } diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h index 8631a1a04e5..2714ae4dcea 100644 --- a/csrc/multidevice/communication.h +++ b/csrc/multidevice/communication.h @@ -59,7 +59,8 @@ class Communication : public Expr { // sharding. 
DeviceIdxType root = -1, RedOpType red_op = RedOpType::UNUSED, - int64_t scattered_axis = -1); + int64_t scattered_axis = -1, + CommunicatorBackend backend = CommunicatorBackend::kNccl); Communication(const Communication& other) = delete; Communication& operator=(const Communication& other) = delete; @@ -107,6 +108,10 @@ class Communication : public Expr { return attribute(4); } + CommunicatorBackend& backend() const { + return attribute(5); + } + // PyTorch's process group expects the root to be specified // as an integer between 0 and world_size-1. We choose it to be // the device's relative index within the team diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 6cf1a499bb9..ce102695637 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -38,6 +38,9 @@ std::ostream& operator<<(std::ostream& out, const CommunicatorBackend& cb) { case CommunicatorBackend::kGloo: out << "GLOO"; break; + case CommunicatorBackend::kCuda: + out << "CUDA"; + break; } return out; } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 9ee6c613da8..3ac48d9906b 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -36,9 +36,6 @@ namespace nvfuser { using RankType = DeviceIdxType; -// Supported backends. 
TODO: gloo untested -enum class CommunicatorBackend { kNccl, kUcc, kGloo }; - std::ostream& operator<<(std::ostream& out, const CommunicatorBackend& cb); #ifdef USE_C10D_NCCL diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index 963b80812d3..eaea12ef2f3 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -23,13 +23,14 @@ namespace nvfuser { MultiDeviceExecutor::MultiDeviceExecutor( std::unique_ptr fusion, Communicator& comm, - hir::HostIrEvaluatorParams params) + MultiDeviceExecutorParams params) : comm_(comm) { + HostIrLower lower(params.lower); std::unique_ptr hic = - HostIrLower::lower(std::move(fusion), comm.deviceId()); + lower.lower(std::move(fusion), comm.deviceId()); // Create the HostIrEvaluator representing the host program host_ir_executor_ = - std::make_unique(std::move(hic), &comm, params); + std::make_unique(std::move(hic), &comm, params.executor); } std::vector MultiDeviceExecutor::runWithInput( diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h index 7cad0388b18..e43b7c57f72 100644 --- a/csrc/multidevice/executor.h +++ b/csrc/multidevice/executor.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,11 @@ namespace nvfuser { +struct MultiDeviceExecutorParams { + hir::HostIrEvaluatorParams executor = hir::HostIrEvaluatorParams(); + HostIrLowerParams lower = HostIrLowerParams(); +}; + /* The MultiDeviceExecutor executes a Fusion on a multi-device setting. It is instantiated from a Fusion and a Communicator. 
@@ -74,7 +80,7 @@ class MultiDeviceExecutor { MultiDeviceExecutor( std::unique_ptr fusion, Communicator& comm, - hir::HostIrEvaluatorParams params = hir::HostIrEvaluatorParams()); + MultiDeviceExecutorParams params = MultiDeviceExecutorParams()); // Run the fusion on several devices with the given global inputs std::vector runWithInput(const std::vector& inputs); diff --git a/csrc/multidevice/multidevice.h b/csrc/multidevice/multidevice.h index 0923383413f..46656f2aceb 100644 --- a/csrc/multidevice/multidevice.h +++ b/csrc/multidevice/multidevice.h @@ -15,4 +15,7 @@ using DeviceIdxType = int64_t; using DimensionType = int; using DeviceType = c10::Device; using Team = std::vector; + +// Supported backends. TODO: gloo untested +enum class CommunicatorBackend { kNccl, kUcc, kGloo, kCuda }; } // namespace nvfuser diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu index 7cbd4753eb3..1d38e034137 100644 --- a/tests/cpp/multidevice_kernels.cu +++ b/tests/cpp/multidevice_kernels.cu @@ -23,40 +23,4 @@ void LaunchDummyMultiDeviceKernel() { DummyMultiDeviceKernel<<<1, 1>>>(); } -int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0; - -AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { - cudaIpcMemHandle_t input_ipc_handle; - CUDA_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr())); - - auto store = communicator->getTcpStore(); - const int64_t my_rank = communicator->deviceId(); - store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle)); - - communicator_->barrier(); - - sizes_.resize(communicator_->size(), 0); - input_ptrs_.resize(communicator_->size(), nullptr); - output_ptrs_.resize(communicator_->size(), nullptr); - for (int64_t rank: c10::irange(communicator_->size())) { - auto output = outputs.at(rank); - sizes_.at(rank) = output.numel() * output.element_size(); - - 
output_ptrs_.at(rank) = output.data_ptr(); - if (rank == my_rank) { - input_ptrs_.at(rank) = input.data_ptr(); - } else { - auto peer_ipc_handle = fromBytes(store->get(prefix() + std::to_string(rank))); - CUDA_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); - } - } -} - -void AllgatherThroughCudaMemcpyAsync::post() const { - for (size_t i = 0; i < sizes_.size(); i++) { - CUDA_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); - } -} - - } // namespace nvfuser diff --git a/tests/cpp/multidevice_kernels.h b/tests/cpp/multidevice_kernels.h index 40e29bd7989..4cd1e6c16b5 100644 --- a/tests/cpp/multidevice_kernels.h +++ b/tests/cpp/multidevice_kernels.h @@ -11,38 +11,6 @@ namespace nvfuser { -template -std::vector toBytes(T data) { - return std::vector( - reinterpret_cast(&data), - reinterpret_cast(&data) + sizeof(T)); -} - -template -T fromBytes(std::vector bytes) { - return *reinterpret_cast(bytes.data()); -} - void LaunchDummyMultiDeviceKernel(); -class AllgatherThroughCudaMemcpyAsync { - public: - AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator); - - void post() const; - - private: - std::string prefix() const { - return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id); - } - - static int64_t running_counter; - int64_t unique_id; - Communicator* communicator_; - std::vector sizes_; - std::vector input_ptrs_; - std::vector output_ptrs_; -}; - - } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 2afe784056d..10d82c99b85 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -62,7 +62,7 @@ TEST_F(GpuCommTest, Allgather) { return at::empty({kTensorSize}, tensor_options); }); - AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_); + 
hir::AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_); allgather.post(); torch::cuda::synchronize(); diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index 5a626bfc967..dbd4befd98b 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -124,10 +124,12 @@ void PipelineTest::executeAndValidate(bool validate_with_prescribed_values) { std::cout << ss.str() << std::endl; } + MultiDeviceExecutorParams params; + params.executor = host_ir_executor_params; runtime = std::make_unique( std::make_unique(*fusion), *communicator_, - host_ir_executor_params); + params); auto error_msg = runtime->validate(); if (error_msg != "") { GTEST_SKIP() << error_msg; From e09dd58d02828b382759c81e872d24f3c29addac Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 29 Jan 2025 06:43:47 -0800 Subject: [PATCH 29/55] add backend type to P2PCommunication --- csrc/multidevice/communication.cpp | 4 +++- csrc/multidevice/communication.h | 10 ++++++++-- tests/cpp/test_multidevice_pipeline.cpp | 1 - 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index c49a5f3a85d..07861329567 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -233,11 +233,13 @@ P2PCommunication::P2PCommunication( IrBuilderPasskey passkey, P2PCommunicationType type, TensorView* buffer, - Val* peer) + Val* peer, + CommunicatorBackend backend) : Expr(passkey) { addInput(buffer); addDataAttribute(type); addAttribute(peer); + addDataAttribute(backend); } NVFUSER_DEFINE_CLONE_AND_CREATE(P2PCommunication) diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h index 2714ae4dcea..944df467a62 100644 --- a/csrc/multidevice/communication.h +++ b/csrc/multidevice/communication.h @@ -108,7 +108,7 @@ class Communication : public Expr { return attribute(4); } - CommunicatorBackend& 
backend() const { + CommunicatorBackend& backend() { return attribute(5); } @@ -133,7 +133,8 @@ class P2PCommunication : public Expr { IrBuilderPasskey passkey, P2PCommunicationType type, TensorView* buffer, - Val* peer); + Val* peer, + CommunicatorBackend backend = CommunicatorBackend::kNccl); P2PCommunication(const P2PCommunication& other) = delete; P2PCommunication& operator=(const P2PCommunication& other) = delete; @@ -159,6 +160,11 @@ class P2PCommunication : public Expr { Val* peer() const { return attributeVal(1); } + + CommunicatorBackend& backend() { + return attribute(2); + } + }; // The method "post" triggers the execution of the communication. This call is diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index dbd4befd98b..ca113123e16 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -154,7 +154,6 @@ void PipelineTest::executeAndValidate(bool validate_with_prescribed_values) { PipelineTest::PipelineTest() { fusion = std::make_unique(); - communicator_->setDefaultBackend(CommunicatorBackend::kNccl); } // To run the following tests on several devices, pytorch must be installed with From 63717467dc86e70c934eec5d680d28156de5e408 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 30 Jan 2025 10:17:13 +0000 Subject: [PATCH 30/55] wip --- csrc/host_ir/executor.cpp | 54 ++++++++++------ csrc/host_ir/executor.h | 2 +- csrc/host_ir/lower.cpp | 6 +- csrc/host_ir/lower.h | 2 +- tests/cpp/test_multidevice_gpu_comms.cpp | 36 +++++++++++ tests/cpp/test_multidevice_overlap.cpp | 78 +++++++++++++----------- 6 files changed, 121 insertions(+), 57 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 8806a8a5b46..bc35fa4ec93 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -415,9 +415,12 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu cudaIpcMemHandle_t input_ipc_handle; 
NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr())); + std::string rank_prefix = "_rank="; + auto store = communicator->getTcpStore(); const int64_t my_rank = communicator->deviceId(); - store->set(prefix() + std::to_string(my_rank), toBytes(input_ipc_handle)); + store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(input_ipc_handle)); + std::cout << "rank " << communicator_->deviceId() << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank) << std::endl; communicator_->barrier(); @@ -432,7 +435,8 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu if (rank == my_rank) { input_ptrs_.at(rank) = input.data_ptr(); } else { - auto peer_ipc_handle = fromBytes(store->get(prefix() + std::to_string(rank))); + std::cout << "rank " << communicator_->deviceId() << " gets at key " << prefix() + rank_prefix + std::to_string(rank) << " for iteration " << rank << std::endl; + auto peer_ipc_handle = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); } } @@ -440,6 +444,7 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu void AllgatherThroughCudaMemcpyAsync::post() const { for (size_t i = 0; i < sizes_.size(); i++) { + std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i)<< std::endl; NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); } } @@ -473,15 +478,23 @@ void HostIrEvaluator::handle(Communication* communication) { } NVF_ERROR(communication->type() == CommunicationType::Allgather); - if (allgather_backends_.find(communication) == allgather_backends_.end()) { - allgather_backends_.try_emplace( - communication, - 
AllgatherThroughCudaMemcpyAsync( - input_tensor, - getKnownTensorOrUndefined(communication->outputs(), expr_evaluator_), - communicator_)); - } - allgather_backends_.at(communication).post(); + // if (allgather_backends_.find(communication) == allgather_backends_.end()) { + // // TODO: retrieve sharded axis here + // auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); + // allgather_backends_.try_emplace( + // communication, + // AllgatherThroughCudaMemcpyAsync( + // input_tensor, + // output_tensors, + // communicator_)); + // } + // allgather_backends_.at(communication).post(); + + auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); + AllgatherThroughCudaMemcpyAsync allgather_backend(input_tensor, output_tensors, communicator_); + allgather_backend.post(); + torch::cuda::synchronize(); + communicator_->barrier(); } void HostIrEvaluator::handle(P2PCommunication* communication) { @@ -492,12 +505,19 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { at::Tensor buffer = getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_); - works_[communication] = postSingleCommunication( - communication, - communicator_->deviceId(), - expr_evaluator_.evaluate(communication->peer()).as(), - communicator_->getWorld(), - buffer); + CommunicatorBackend backend_type = communication->backend(); + + if (backend_type != CommunicatorBackend::kCuda) { + + works_[communication] = postSingleCommunication( + communication, + communicator_->deviceId(), + expr_evaluator_.evaluate(communication->peer()).as(), + communicator_->getWorld(), + buffer); + return; + } + NVF_ERROR(false, "CUDA backend not supported yet"); } void HostIrEvaluator::handle(Wait* wait) { diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 435a568bc63..f4cb7608d1d 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -71,7 +71,7 @@ class AllgatherThroughCudaMemcpyAsync { 
private: std::string prefix() const { - return "AllgatherThroughCudaMemcpyAsync" + std::to_string(unique_id); + return "AllgatherThroughCudaMemcpyAsync_uniqueId=" + std::to_string(unique_id); } static int64_t running_counter; diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index ff6a99dc421..0bb4a8b885c 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -475,7 +475,11 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( CommunicationType::Allgather, /*out=*/tva_allgathered_j, /*in=*/tva_j, - /*team=*/tva->getDeviceMesh().vector()); + /*team=*/tva->getDeviceMesh().vector(), + /*root=*/-1, + /*red_op=*/RedOpType::UNUSED, + /*scattered_axis=*/-1, + params_.communicator_backend); auto* wait = IrBuilder::create(communication); Expr* compute = nullptr; diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 47417e9eba4..88d5dd10fa7 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -36,7 +36,7 @@ class HostIrLower { int64_t my_device_index); private: - static std::vector lowerToCollectiveBasedPipelinedGemmComm(Expr* expr); + std::vector lowerToCollectiveBasedPipelinedGemmComm(Expr* expr); HostIrLowerParams params_; }; diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 10d82c99b85..e017830ea53 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -53,6 +53,42 @@ TEST_F(GpuCommTest, IpcMemHandle) { } +TEST_F(GpuCommTest, IpcMemHandlePtrArithmetic) { + // Allocate GPU memory + constexpr size_t size = 2 * sizeof(int64_t); + const int64_t num_devices = communicator_->size(); + const int64_t rank = communicator_->deviceId(); + const int64_t peer_rank = (rank + 1) % num_devices; + void* d_ptr; + CUDA_CALL(cudaMalloc(&d_ptr, size)); + + std::vector values; + values.push_back(2 * rank); + values.push_back(2 * rank + 1); + CUDA_CALL(cudaMemcpy(d_ptr, values.data(), size, cudaMemcpyHostToDevice)); + + 
cudaIpcMemHandle_t ipc_handle; + CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr)); + + auto store = communicator_->getTcpStore(); + store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle)); + communicator_->barrier(); + auto peer_ipc_handle = fromBytes(store->get("ipc_handle_" + std::to_string(peer_rank))); + + int64_t* peer_d_ptr; + CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + + int64_t peer_value; + CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr + 1, size / 2, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(2 * peer_rank + 1, peer_value); + + // Clean up + CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); + CUDA_CALL(cudaFree(d_ptr)); + +} + TEST_F(GpuCommTest, Allgather) { constexpr int64_t kTensorSize = 1024; diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index c08eea14b93..100cc3b92a4 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -236,7 +236,9 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { unfuse_loops, use_cuda_graph, dtype] = GetParam(); - + if (backend == CommunicatorBackend::kCuda) { + GTEST_SKIP() << "Cuda Backend not supported in this test"; + } GTEST_ASSERT_EQ(M % S, 0); std::vector all_ranks(communicator_->size()); @@ -348,10 +350,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { } TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { - constexpr int64_t number_of_warmups = 50; - constexpr int64_t number_of_iterations = 200; - constexpr int64_t iteration_profiler_start = 10; - constexpr int64_t iteration_profiler_end = 15; + // constexpr int64_t number_of_warmups = 50; + // constexpr int64_t number_of_iterations = 200; + // constexpr int64_t iteration_profiler_start = 10; + // constexpr int64_t iteration_profiler_end = 15; const int64_t D = communicator_->size(); auto [backend, @@ -403,10 +405,9 @@ TEST_P(OverlapBenchmark, 
PipelinedAGMatmulBenchmarkStreamParallelType) { a->axis(1)->parallelize(ParallelType::DIDx); c->axis(0)->parallelize(ParallelType::Stream); - communicator_->setDefaultBackend(backend); - - hir::HostIrEvaluatorParams params; - params.number_of_streams = number_of_streams; + MultiDeviceExecutorParams params; + params.lower.communicator_backend = backend; + params.executor.number_of_streams = number_of_streams; MultiDeviceExecutor executor(std::move(fusion), *communicator_, params); @@ -421,47 +422,50 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { std::vector inputs = {ta, tb}; at::Tensor tc; - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + // cudaEvent_t start, stop; + // cudaEventCreate(&start); + // cudaEventCreate(&stop); - for (const auto& iteration : - c10::irange(number_of_warmups + number_of_iterations)) { - if (iteration == iteration_profiler_start) { - cudaProfilerStart();; - } - if (iteration == number_of_warmups) { - cudaEventRecord(start); - } + // for (const auto& iteration : + // c10::irange(1)) { + // if (iteration == iteration_profiler_start) { + // cudaProfilerStart();; + // } + // if (iteration == number_of_warmups) { + // cudaEventRecord(start); + // } tc = executor.runWithInput(inputs).at(0); - if (iteration == iteration_profiler_end) { - cudaProfilerStop();; - } - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - milliseconds /= number_of_iterations; + // if (iteration == iteration_profiler_end) { + // cudaProfilerStop();; + // } + // } + // cudaEventRecord(stop); + // cudaEventSynchronize(stop); + // float milliseconds = 0; + // cudaEventElapsedTime(&milliseconds, start, stop); + // milliseconds /= number_of_iterations; - std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); - times.insert({test_name, milliseconds}); - std::cout << "rank " << 
communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + // std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + // times.insert({test_name, milliseconds}); + // std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + + torch::cuda::synchronize(); + communicator_->barrier(); - EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)); + EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref << ",\ntc: " << tc; } INSTANTIATE_TEST_SUITE_P( , OverlapBenchmark, testing::Combine( - testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), + testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc, CommunicatorBackend::kCuda), /*S=*/testing::Values(1,2,4,8, 16, 32), - /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)), - /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,18)), - /*N=*/testing::Values(pow(2,10), pow(2,15)), + /*M=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)), + /*K=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)), + /*N=*/testing::Values(pow(2,3), pow(2,10), pow(2,15)), /*number_of_streams=*/testing::Values(3, 8, 32), /*add_cuStreamWriteValue32*/testing::Values(false, true), /*number_of_pgs=*/testing::Values(1, 2, 4, 8), From b700a31b881ebc29c66dda1430eb68ef3db07097 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 30 Jan 2025 03:36:17 -0800 Subject: [PATCH 31/55] working chkpt --- csrc/host_ir/executor.cpp | 37 ++++++++++++++++----- tests/cpp/test_multidevice_gpu_comms.cpp | 42 +++++++++++++++++++++++- tests/cpp/test_multidevice_overlap.cpp | 2 ++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index bc35fa4ec93..4714d14ded2 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -411,16 +411,30 @@ void 
HostIrEvaluator::handle(PostOnStream* post_ir) { int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0; +struct IpcTensorInfo { + cudaIpcMemHandle_t ipc_handle; + int64_t storage_offset; + int64_t element_size; +}; + AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { - cudaIpcMemHandle_t input_ipc_handle; - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&input_ipc_handle, input.data_ptr())); std::string rank_prefix = "_rank="; + std::string ipc_handle_prefix = "_IpcHandle="; + std::string offset_prefix = "_Offset="; + + IpcTensorInfo ipc_tensor_info; + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr())); + ipc_tensor_info.storage_offset = input.storage_offset(); + ipc_tensor_info.element_size = input.element_size(); - auto store = communicator->getTcpStore(); const int64_t my_rank = communicator->deviceId(); - store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(input_ipc_handle)); - std::cout << "rank " << communicator_->deviceId() << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank) << std::endl; + auto store = communicator->getTcpStore(); + store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(ipc_tensor_info)); + std::cout << "rank " << communicator_->deviceId() + << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank) + << " offset " << input.storage_offset() << " at key " << prefix() + offset_prefix + std::to_string(my_rank) + << ", for input=" << input << std::endl; communicator_->barrier(); @@ -436,16 +450,20 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu input_ptrs_.at(rank) = input.data_ptr(); } else { std::cout << "rank " << communicator_->deviceId() << " gets at key " << prefix() + rank_prefix + std::to_string(rank) << " for iteration " << rank << std::endl; - auto 
peer_ipc_handle = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&input_ptrs_.at(rank), peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + ipc_tensor_info = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); + // auto peer_ipc_handle = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); + void*& ptr = input_ptrs_.at(rank); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size); } } } void AllgatherThroughCudaMemcpyAsync::post() const { for (size_t i = 0; i < sizes_.size(); i++) { - std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i)<< std::endl; NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); + torch::cuda::synchronize(); + std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i) << std::endl; } } @@ -495,6 +513,9 @@ void HostIrEvaluator::handle(Communication* communication) { allgather_backend.post(); torch::cuda::synchronize(); communicator_->barrier(); + if (communicator_->deviceId() == 0) { + std::cout << "rank " << communicator_->deviceId() << " finishes allgather, output=" << output_tensor << std::endl; + } } void HostIrEvaluator::handle(P2PCommunication* communication) { diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index e017830ea53..d59fe3628a1 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -53,7 +53,9 @@ TEST_F(GpuCommTest, IpcMemHandle) { } -TEST_F(GpuCommTest, 
IpcMemHandlePtrArithmetic) { +TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) { + // TLDR; We can do pointer arithmetic on the receiver side. + // Allocate GPU memory constexpr size_t size = 2 * sizeof(int64_t); const int64_t num_devices = communicator_->size(); @@ -89,6 +91,44 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmetic) { } +TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { + // TLDR; We CANNOT do pointer arithmetic on the sender side! The IPC handle points to the beginning of the allocated buffer. + + // Allocate GPU memory + constexpr size_t size = 2 * sizeof(int64_t); + const int64_t num_devices = communicator_->size(); + const int64_t rank = communicator_->deviceId(); + const int64_t peer_rank = (rank + 1) % num_devices; + int64_t* d_ptr; + CUDA_CALL(cudaMalloc(&d_ptr, size)); + + std::vector values; + values.push_back(2 * rank); + values.push_back(2 * rank + 1); + CUDA_CALL(cudaMemcpy(d_ptr, values.data(), size, cudaMemcpyHostToDevice)); + + cudaIpcMemHandle_t ipc_handle; + CUDA_CALL(cudaIpcGetMemHandle(&ipc_handle, d_ptr + 1)); + + auto store = communicator_->getTcpStore(); + store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle)); + communicator_->barrier(); + auto peer_ipc_handle = fromBytes(store->get("ipc_handle_" + std::to_string(peer_rank))); + + int64_t* peer_d_ptr; + CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + + int64_t peer_value; + CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size / 2, cudaMemcpyDeviceToHost)); + + EXPECT_EQ(2 * peer_rank, peer_value); // and not 2 * peer_rank + 1 as could be expected! 
+ + // Clean up + CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); + CUDA_CALL(cudaFree(d_ptr)); + +} + TEST_F(GpuCommTest, Allgather) { constexpr int64_t kTensorSize = 1024; diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 100cc3b92a4..4fa60f67ecc 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -422,6 +422,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { std::vector inputs = {ta, tb}; at::Tensor tc; + std::cout << "rank " << communicator_->deviceId() << ", ta_unsharded_ptr=" << ta_unsharded.data_ptr() << ", ta_ptr=" << ta.data_ptr() << std::endl; + // cudaEvent_t start, stop; // cudaEventCreate(&start); // cudaEventCreate(&stop); From 1838d1e7d91fedaa161c4d6a8bd56810741c98f1 Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 30 Jan 2025 14:04:34 +0200 Subject: [PATCH 32/55] remove prints --- csrc/host_ir/executor.cpp | 16 ++-------------- tests/cpp/test_multidevice_gpu_comms.cpp | 2 -- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4714d14ded2..de9d3ecbcba 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -420,8 +420,6 @@ struct IpcTensorInfo { AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { std::string rank_prefix = "_rank="; - std::string ipc_handle_prefix = "_IpcHandle="; - std::string offset_prefix = "_Offset="; IpcTensorInfo ipc_tensor_info; NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr())); @@ -431,10 +429,6 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu const int64_t my_rank = communicator->deviceId(); auto store = communicator->getTcpStore(); store->set(prefix() + rank_prefix + std::to_string(my_rank), 
toBytes(ipc_tensor_info)); - std::cout << "rank " << communicator_->deviceId() - << " sets at key " << prefix() + rank_prefix + std::to_string(my_rank) - << " offset " << input.storage_offset() << " at key " << prefix() + offset_prefix + std::to_string(my_rank) - << ", for input=" << input << std::endl; communicator_->barrier(); @@ -449,7 +443,6 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu if (rank == my_rank) { input_ptrs_.at(rank) = input.data_ptr(); } else { - std::cout << "rank " << communicator_->deviceId() << " gets at key " << prefix() + rank_prefix + std::to_string(rank) << " for iteration " << rank << std::endl; ipc_tensor_info = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); // auto peer_ipc_handle = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); void*& ptr = input_ptrs_.at(rank); @@ -457,20 +450,18 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size); } } + // TODO: close ipc mem handle at shutdown } void AllgatherThroughCudaMemcpyAsync::post() const { + // TODO: use multicast for (size_t i = 0; i < sizes_.size(); i++) { NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); torch::cuda::synchronize(); - std::cout << "rank " << communicator_->deviceId() <<", iteration " << i << ", input_ptr=" << input_ptrs_.at(i) << ", output_ptr=" << output_ptrs_.at(i) << ", size=" << sizes_.at(i) << std::endl; } } - - - void HostIrEvaluator::handle(Communication* communication) { NVF_ERROR( communicator_ != nullptr && communicator_->is_available(), @@ -513,9 +504,6 @@ void HostIrEvaluator::handle(Communication* communication) { allgather_backend.post(); torch::cuda::synchronize(); communicator_->barrier(); - if (communicator_->deviceId() == 0) { - std::cout << "rank " << communicator_->deviceId() 
<< " finishes allgather, output=" << output_tensor << std::endl; - } } void HostIrEvaluator::handle(P2PCommunication* communication) { diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index d59fe3628a1..026df97ea26 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -88,7 +88,6 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) { // Clean up CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); CUDA_CALL(cudaFree(d_ptr)); - } TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { @@ -126,7 +125,6 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { // Clean up CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); CUDA_CALL(cudaFree(d_ptr)); - } TEST_F(GpuCommTest, Allgather) { From 21eed4a7acf0e9e05a117f312deaf80f1a24230a Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 30 Jan 2025 08:51:02 -0800 Subject: [PATCH 33/55] working chkpt --- csrc/host_ir/executor.cpp | 19 ++++---- csrc/host_ir/executor.h | 2 +- tests/cpp/test_multidevice_overlap.cpp | 61 ++++++++++++-------------- 3 files changed, 37 insertions(+), 45 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index de9d3ecbcba..012f5d054e2 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -190,7 +190,7 @@ HostIrEvaluator::HostIrEvaluator( : container_(std::move(container)), communicator_(communicator), params_(params), - my_device_index_(communicator_ ? communicator_->deviceId() : 0) { + my_local_device_index_(communicator_ ? communicator_->local_rank() : 0) { const DeviceIdxType device_index = (communicator_ != nullptr && communicator_->is_available()) ? 
communicator_->deviceId() @@ -280,13 +280,13 @@ void HostIrEvaluator::handle(GetCurrentStream* get_current_stream) { streams_.insert( {get_current_stream->stream(), c10::cuda::getCurrentCUDAStream( - static_cast(my_device_index_))}); + static_cast(my_local_device_index_))}); } void HostIrEvaluator::handle(Synchronize* synchronize) { cudaStream_t current_stream = c10::cuda::getCurrentCUDAStream( - static_cast(my_device_index_)) + static_cast(my_local_device_index_)) .stream(); cudaStream_t stream_to_sync = getCUDAStream(synchronize->stream()).stream(); @@ -419,7 +419,6 @@ struct IpcTensorInfo { AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { - std::string rank_prefix = "_rank="; IpcTensorInfo ipc_tensor_info; NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr())); @@ -428,7 +427,7 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu const int64_t my_rank = communicator->deviceId(); auto store = communicator->getTcpStore(); - store->set(prefix() + rank_prefix + std::to_string(my_rank), toBytes(ipc_tensor_info)); + store->set(prefix() + std::to_string(my_rank), toBytes(ipc_tensor_info)); communicator_->barrier(); @@ -443,8 +442,7 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu if (rank == my_rank) { input_ptrs_.at(rank) = input.data_ptr(); } else { - ipc_tensor_info = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); - // auto peer_ipc_handle = fromBytes(store->get(prefix() + rank_prefix + std::to_string(rank))); + ipc_tensor_info = fromBytes(store->get(prefix() + std::to_string(rank))); void*& ptr = input_ptrs_.at(rank); NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * 
ipc_tensor_info.element_size); @@ -454,10 +452,10 @@ AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor inpu } void AllgatherThroughCudaMemcpyAsync::post() const { + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(communicator_->local_rank())).stream(); // TODO: use multicast for (size_t i = 0; i < sizes_.size(); i++) { - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice)); - torch::cuda::synchronize(); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice, stream)); } } @@ -487,6 +485,7 @@ void HostIrEvaluator::handle(Communication* communication) { } NVF_ERROR(communication->type() == CommunicationType::Allgather); + // TODO: fix registration cache // if (allgather_backends_.find(communication) == allgather_backends_.end()) { // // TODO: retrieve sharded axis here // auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); @@ -502,8 +501,6 @@ void HostIrEvaluator::handle(Communication* communication) { auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); AllgatherThroughCudaMemcpyAsync allgather_backend(input_tensor, output_tensors, communicator_); allgather_backend.post(); - torch::cuda::synchronize(); - communicator_->barrier(); } void HostIrEvaluator::handle(P2PCommunication* communication) { diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index f4cb7608d1d..44e615eb484 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -176,7 +176,7 @@ class HostIrEvaluator final : public OptOutDispatch { using StreamKey = std::variant; std::unordered_map streams_; std::unordered_map> works_; - const int64_t my_device_index_; + const int64_t my_local_device_index_; std::unordered_map allgather_backends_; }; diff --git a/tests/cpp/test_multidevice_overlap.cpp 
b/tests/cpp/test_multidevice_overlap.cpp index 4fa60f67ecc..21c2a838805 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -350,10 +350,10 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { } TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { - // constexpr int64_t number_of_warmups = 50; - // constexpr int64_t number_of_iterations = 200; - // constexpr int64_t iteration_profiler_start = 10; - // constexpr int64_t iteration_profiler_end = 15; + constexpr int64_t number_of_warmups = 50; + constexpr int64_t number_of_iterations = 200; + constexpr int64_t iteration_profiler_start = 10; + constexpr int64_t iteration_profiler_end = 15; const int64_t D = communicator_->size(); auto [backend, @@ -422,39 +422,34 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { std::vector inputs = {ta, tb}; at::Tensor tc; - std::cout << "rank " << communicator_->deviceId() << ", ta_unsharded_ptr=" << ta_unsharded.data_ptr() << ", ta_ptr=" << ta.data_ptr() << std::endl; - - // cudaEvent_t start, stop; - // cudaEventCreate(&start); - // cudaEventCreate(&stop); + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); - // for (const auto& iteration : - // c10::irange(1)) { - // if (iteration == iteration_profiler_start) { - // cudaProfilerStart();; - // } - // if (iteration == number_of_warmups) { - // cudaEventRecord(start); - // } + for (const auto& iteration : + c10::irange(number_of_iterations)) { + if (iteration == iteration_profiler_start) { + // cudaProfilerStart();; + } + if (iteration == number_of_warmups) { + cudaEventRecord(start); + } tc = executor.runWithInput(inputs).at(0); - // if (iteration == iteration_profiler_end) { - // cudaProfilerStop();; - // } - // } - // cudaEventRecord(stop); - // cudaEventSynchronize(stop); - // float milliseconds = 0; - // cudaEventElapsedTime(&milliseconds, start, stop); - // milliseconds /= number_of_iterations; - - // 
std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); - // times.insert({test_name, milliseconds}); - // std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; - - torch::cuda::synchronize(); - communicator_->barrier(); + if (iteration == iteration_profiler_end) { + // cudaProfilerStop();; + } + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + milliseconds /= number_of_iterations; + + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + times.insert({test_name, milliseconds}); + std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref << ",\ntc: " << tc; } From f455c7093186cd2c8fa430b2ecac76556a41a4ca Mon Sep 17 00:00:00 2001 From: snordmann Date: Thu, 30 Jan 2025 09:11:44 -0800 Subject: [PATCH 34/55] reenable profiling --- tests/cpp/test_multidevice_overlap.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 21c2a838805..6344fcae890 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -429,7 +429,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { for (const auto& iteration : c10::irange(number_of_iterations)) { if (iteration == iteration_profiler_start) { - // cudaProfilerStart();; + cudaProfilerStart(); } if (iteration == number_of_warmups) { cudaEventRecord(start); @@ -438,7 +438,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { tc = executor.runWithInput(inputs).at(0); if (iteration == iteration_profiler_end) { - // cudaProfilerStop();; + cudaProfilerStop(); } } 
cudaEventRecord(stop); From 5a27b7e5d4231073a4ea7ca0256bcd0a9a618903 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 31 Jan 2025 06:26:25 -0800 Subject: [PATCH 35/55] fix cache for ipc handles --- bench/test | 10 +-- csrc/host_ir/executor.cpp | 77 +++--------------------- csrc/host_ir/executor.h | 33 ---------- csrc/multidevice/communicator.cpp | 41 +++++++++++++ csrc/multidevice/communicator.h | 30 +++++++++ tests/cpp/test_multidevice_gpu_comms.cpp | 3 +- 6 files changed, 88 insertions(+), 106 deletions(-) diff --git a/bench/test b/bench/test index 8527e2d370c..19275e4b2e5 100755 --- a/bench/test +++ b/bench/test @@ -1,15 +1,15 @@ #!/bin/bash -EXPERIMENT=StreamParallelType_tests +EXPERIMENT=CUDA_tests DATE=$(date +%Y%m%d-%H%M) LOG_BASE="/opt/pytorch/Fuser/bench/logs" NP=8 -BACKEND=UCC +BACKEND=CUDA M=32768 K=32768 N=1024 -DTYPE="__half" # float, __bfloat +DTYPE="float" #"__half" # float, __bfloat S=8 Streams=3 @@ -80,7 +80,9 @@ echo "test cmd: $TEST_CMD" | tee -a $LOG_FILE_INFO MPICMD="mpirun $MPIFLAGS $TEST_CMD" echo $MPICMD | tee -a $LOG_FILE_INFO -NSYSCMD="nsys profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" +# opt/pytorch/scripts/nsight/install-nsight.sh +NSYS=$(sudo which nsys) +NSYSCMD="${NSYS} profile --stats=false -w true -t cublas,cuda,nvtx,osrt,mpi,ucx -o ${LOGS}/${GTEST_POSTFIX} --capture-range-end stop --capture-range=cudaProfilerApi --cudabacktrace=memory,sync,kernel,other" CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 012f5d054e2..7bc35107996 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -409,57 +409,6 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } } -int64_t AllgatherThroughCudaMemcpyAsync::running_counter = 0; - -struct IpcTensorInfo { - 
cudaIpcMemHandle_t ipc_handle; - int64_t storage_offset; - int64_t element_size; -}; - -AllgatherThroughCudaMemcpyAsync::AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator) : unique_id(running_counter++), communicator_(communicator) { - - - IpcTensorInfo ipc_tensor_info; - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, input.data_ptr())); - ipc_tensor_info.storage_offset = input.storage_offset(); - ipc_tensor_info.element_size = input.element_size(); - - const int64_t my_rank = communicator->deviceId(); - auto store = communicator->getTcpStore(); - store->set(prefix() + std::to_string(my_rank), toBytes(ipc_tensor_info)); - - communicator_->barrier(); - - sizes_.resize(communicator_->size(), 0); - input_ptrs_.resize(communicator_->size(), nullptr); - output_ptrs_.resize(communicator_->size(), nullptr); - for (int64_t rank: c10::irange(communicator_->size())) { - auto output = outputs.at(rank); - sizes_.at(rank) = output.numel() * output.element_size(); - - output_ptrs_.at(rank) = output.data_ptr(); - if (rank == my_rank) { - input_ptrs_.at(rank) = input.data_ptr(); - } else { - ipc_tensor_info = fromBytes(store->get(prefix() + std::to_string(rank))); - void*& ptr = input_ptrs_.at(rank); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); - ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size); - } - } - // TODO: close ipc mem handle at shutdown -} - -void AllgatherThroughCudaMemcpyAsync::post() const { - cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(communicator_->local_rank())).stream(); - // TODO: use multicast - for (size_t i = 0; i < sizes_.size(); i++) { - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output_ptrs_.at(i), input_ptrs_.at(i), sizes_.at(i), cudaMemcpyDeviceToDevice, stream)); - } -} - - void HostIrEvaluator::handle(Communication* communication) { NVF_ERROR( 
communicator_ != nullptr && communicator_->is_available(), @@ -485,22 +434,16 @@ void HostIrEvaluator::handle(Communication* communication) { } NVF_ERROR(communication->type() == CommunicationType::Allgather); - // TODO: fix registration cache - // if (allgather_backends_.find(communication) == allgather_backends_.end()) { - // // TODO: retrieve sharded axis here - // auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); - // allgather_backends_.try_emplace( - // communication, - // AllgatherThroughCudaMemcpyAsync( - // input_tensor, - // output_tensors, - // communicator_)); - // } - // allgather_backends_.at(communication).post(); - - auto output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); - AllgatherThroughCudaMemcpyAsync allgather_backend(input_tensor, output_tensors, communicator_); - allgather_backend.post(); + + std::vector output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); + std::vector input_ptrs = communicator_->getRemotePtrs(input_tensor); + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(communicator_->local_rank())).stream(); + // TODO: use multicast + for (auto i = 0; i < communicator_->size(); i++) { + auto output = output_tensors.at(i); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output.data_ptr(), input_ptrs.at(i), output.numel() * output.element_size(), cudaMemcpyDeviceToDevice, stream)); + } + } void HostIrEvaluator::handle(P2PCommunication* communication) { diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 44e615eb484..8e281b66143 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -48,40 +48,8 @@ class HostIrExecutor : public ExecutorAbstract { Communicator* communicator_; }; -template -std::vector toBytes(T data) { - return std::vector( - reinterpret_cast(&data), - reinterpret_cast(&data) + sizeof(T)); -} - -template -T fromBytes(std::vector bytes) { - return 
*reinterpret_cast(bytes.data()); -} - namespace hir { - -class AllgatherThroughCudaMemcpyAsync { - public: - AllgatherThroughCudaMemcpyAsync(at::Tensor input, std::vector outputs, Communicator* communicator); - - void post() const; - - private: - std::string prefix() const { - return "AllgatherThroughCudaMemcpyAsync_uniqueId=" + std::to_string(unique_id); - } - - static int64_t running_counter; - int64_t unique_id; - Communicator* communicator_; - std::vector sizes_; - std::vector input_ptrs_; - std::vector output_ptrs_; -}; - /* a HostIrEvaluator evaluates a host programs represented through a HostIrContainer It is instantiated with the desired HostIrContainer, and runs @@ -177,7 +145,6 @@ class HostIrEvaluator final : public OptOutDispatch { std::unordered_map streams_; std::unordered_map> works_; const int64_t my_local_device_index_; - std::unordered_map allgather_backends_; }; } // namespace hir diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index ce102695637..ce8c2226ca1 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -319,4 +319,45 @@ void Communicator::barrier(std::optional backend) { getWorld(backend)->barrier(options)->wait(); } +struct IpcTensorInfo { + cudaIpcMemHandle_t ipc_handle; + int64_t storage_offset; + int64_t element_size; +}; + +std::vector Communicator::getRemotePtrs(at::Tensor tensor) { + auto it = remote_ptrs_.find(tensor); + if (it == remote_ptrs_.end()) { + if (deviceId() == 0) { + std::cout << "rank " << deviceId() << " registers tensor " << tensor.data_ptr() << "with hash" << std::endl; + } + std::vector remote_ptrs(size(), nullptr); + std::string prefix = "nvfuser_ipc_tensor_info_"; + IpcTensorInfo ipc_tensor_info; + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr())); + ipc_tensor_info.storage_offset = tensor.storage_offset(); + ipc_tensor_info.element_size = tensor.element_size(); + + const int64_t my_rank = 
deviceId(); + auto store = getTcpStore(); + store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info)); + + barrier(); + + for (int64_t rank: c10::irange(size())) { + if (rank == my_rank) { + remote_ptrs.at(rank) = tensor.data_ptr(); + } else { + ipc_tensor_info = fromBytes(store->get(prefix + std::to_string(rank))); + void*& ptr = remote_ptrs.at(rank); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // TODO: close ipc mem handle at shutdown + ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size); + } + } + it = remote_ptrs_.emplace(tensor, std::move(remote_ptrs)).first; + } + return it->second; +} + } // namespace nvfuser diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 3ac48d9906b..7124b91f006 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -24,6 +24,18 @@ namespace nvfuser { +template +std::vector toBytes(T data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +T fromBytes(std::vector bytes) { + return *reinterpret_cast(bytes.data()); +} + // This file implements the class Communicator which sets up the inter-process // Backend. 
This class contains inter-process information, such as the rank, the // world size, as well as the Process Group that can be called to perform @@ -142,7 +154,24 @@ class Communicator { return store_; } + std::vector getRemotePtrs(at::Tensor tensor); + private: + struct TensorHash { + std::size_t operator()(const at::Tensor& tensor) const { + auto ptr = reinterpret_cast(tensor.data_ptr()); + auto offset = tensor.storage_offset(); + auto element_size = tensor.element_size(); + return std::hash()(ptr) ^ std::hash()(offset) ^ std::hash()(element_size); + } + }; + + struct TensorEqual { + bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const { + return lhs.equal(rhs); + } + }; + Communicator( CommunicatorBackend backend = comm_backend_default, RankType server_local_rank = comm_server_local_rank_default); @@ -175,6 +204,7 @@ class Communicator { c10::intrusive_ptr store_; // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; + std::unordered_map, TensorHash, TensorEqual> remote_ptrs_; }; } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 026df97ea26..db44f0a5e31 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -136,8 +136,7 @@ TEST_F(GpuCommTest, Allgather) { return at::empty({kTensorSize}, tensor_options); }); - hir::AllgatherThroughCudaMemcpyAsync allgather(input, outputs, communicator_); - allgather.post(); + // AllgatherThroughCudaMemcpyAsync(input, outputs, communicator_); torch::cuda::synchronize(); communicator_->barrier(); From 356feeb188c9aec0db0c3dc1d2dabe5f3ddf9f32 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 3 Feb 2025 07:08:09 -0800 Subject: [PATCH 36/55] synchronize running stream with original stream at the beginning of pipeline for-loop --- csrc/host_ir/executor.cpp | 19 ++++++++++++++++++- csrc/host_ir/lower.cpp | 2 ++ 2 files changed, 20 insertions(+), 
1 deletion(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 7bc35107996..4133f3f7a75 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -437,11 +437,28 @@ void HostIrEvaluator::handle(Communication* communication) { std::vector output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); std::vector input_ptrs = communicator_->getRemotePtrs(input_tensor); - cudaStream_t stream = c10::cuda::getCurrentCUDAStream(static_cast(communicator_->local_rank())).stream(); + cudaStream_t current_stream = c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream(); // TODO: use multicast for (auto i = 0; i < communicator_->size(); i++) { + cudaStream_t stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false, my_local_device_index_).stream(); + cudaEvent_t event = {}; + NVFUSER_CUDA_RT_SAFE_CALL( + cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream)); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaStreamWaitEvent(stream, event, cudaEventWaitDefault)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); + auto output = output_tensors.at(i); NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output.data_ptr(), input_ptrs.at(i), output.numel() * output.element_size(), cudaMemcpyDeviceToDevice, stream)); + + // sync + NVFUSER_CUDA_RT_SAFE_CALL( + cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream)); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); } } diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 0bb4a8b885c..f4a53783460 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -457,6 +457,7 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( auto* stream_index = mod(j, number_of_streams); auto* stream = 
IrBuilder::create(stream_index); auto* set_stream = IrBuilder::create(stream); + auto* initial_sync_stream = IrBuilder::create(original_stream); TensorView* tva_j = select(tva, 0, j); TensorView* tva_allgathered_j = select(tva_allgathered, 0, j); @@ -496,6 +497,7 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( std::vector loop_body = { set_stream, + initial_sync_stream, tva_j->definition(), tva_allgathered_j->definition(), communication, From 4c0736a7f06e9a1b0189bd194f3cbee6aa8e93e7 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 3 Feb 2025 07:11:01 -0800 Subject: [PATCH 37/55] lint --- bench/process_outputs | 2 +- bench/test | 1 - csrc/host_ir/executor.cpp | 19 +- csrc/host_ir/lower.cpp | 3 +- csrc/host_ir/lower.h | 4 +- csrc/multidevice/communication.h | 1 - csrc/multidevice/communicator.cpp | 18 +- csrc/multidevice/communicator.h | 6 +- csrc/multidevice/executor.cpp | 4 +- csrc/multidevice/executor.h | 2 +- tests/cpp/multidevice_kernels.cu | 8 +- tests/cpp/test_multidevice_gpu_comms.cpp | 35 +-- tests/cpp/test_multidevice_overlap.cpp | 258 +++++++++++++---------- tests/cpp/test_multidevice_pipeline.cpp | 4 +- 14 files changed, 217 insertions(+), 148 deletions(-) diff --git a/bench/process_outputs b/bench/process_outputs index c1781394dbc..8913a10dd04 100755 --- a/bench/process_outputs +++ b/bench/process_outputs @@ -4,4 +4,4 @@ FILE="/opt/pytorch/Fuser/bench/logs/${1}/info" cat $FILE | grep "rank 0: " #| awk '{print $4}' -# | grep -E 'Streams32\b' \ No newline at end of file +# | grep -E 'Streams32\b' diff --git a/bench/test b/bench/test index 19275e4b2e5..8abc200f9a9 100755 --- a/bench/test +++ b/bench/test @@ -88,4 +88,3 @@ CMD="${NSYSCMD} ${MPICMD}" sudo /bin/sh -c "echo '1' > /proc/sys/kernel/perf_event_paranoid" echo $CMD | tee -a ${LOG_FILE_INFO} $CMD | tee -a ${LOG_FILE_INFO} - diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4133f3f7a75..cc30ee58316 100644 --- a/csrc/host_ir/executor.cpp +++ 
b/csrc/host_ir/executor.cpp @@ -435,12 +435,16 @@ void HostIrEvaluator::handle(Communication* communication) { NVF_ERROR(communication->type() == CommunicationType::Allgather); - std::vector output_tensors = at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); + std::vector output_tensors = + at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); std::vector input_ptrs = communicator_->getRemotePtrs(input_tensor); - cudaStream_t current_stream = c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream(); + cudaStream_t current_stream = + c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream(); // TODO: use multicast for (auto i = 0; i < communicator_->size(); i++) { - cudaStream_t stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false, my_local_device_index_).stream(); + cudaStream_t stream = c10::cuda::getStreamFromPool( + /*isHighPriority=*/false, my_local_device_index_) + .stream(); cudaEvent_t event = {}; NVFUSER_CUDA_RT_SAFE_CALL( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -450,7 +454,12 @@ void HostIrEvaluator::handle(Communication* communication) { NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); auto output = output_tensors.at(i); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(output.data_ptr(), input_ptrs.at(i), output.numel() * output.element_size(), cudaMemcpyDeviceToDevice, stream)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + output.data_ptr(), + input_ptrs.at(i), + output.numel() * output.element_size(), + cudaMemcpyDeviceToDevice, + stream)); // sync NVFUSER_CUDA_RT_SAFE_CALL( @@ -460,7 +469,6 @@ void HostIrEvaluator::handle(Communication* communication) { cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault)); NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); } - } void HostIrEvaluator::handle(P2PCommunication* communication) { @@ -474,7 +482,6 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { CommunicatorBackend backend_type = 
communication->backend(); if (backend_type != CommunicatorBackend::kCuda) { - works_[communication] = postSingleCommunication( communication, communicator_->deviceId(), diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index f4a53783460..194d7ee7170 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -457,7 +457,8 @@ std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( auto* stream_index = mod(j, number_of_streams); auto* stream = IrBuilder::create(stream_index); auto* set_stream = IrBuilder::create(stream); - auto* initial_sync_stream = IrBuilder::create(original_stream); + auto* initial_sync_stream = + IrBuilder::create(original_stream); TensorView* tva_j = select(tva, 0, j); TensorView* tva_allgathered_j = select(tva_allgathered, 0, j); diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 88d5dd10fa7..bce81d3ecab 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -20,8 +20,8 @@ struct HostIrLowerParams { class HostIrLower { public: - - HostIrLower(HostIrLowerParams params = HostIrLowerParams()) : params_(params) {} + HostIrLower(HostIrLowerParams params = HostIrLowerParams()) + : params_(params) {} // The flag `ignore_inner_resharding` is useful because the preseg passes // `InsertReshardingsPass` and `ReorderShardedAxisPass` want different diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h index 944df467a62..d8724356e15 100644 --- a/csrc/multidevice/communication.h +++ b/csrc/multidevice/communication.h @@ -164,7 +164,6 @@ class P2PCommunication : public Expr { CommunicatorBackend& backend() { return attribute(2); } - }; // The method "post" triggers the execution of the communication. 
This call is diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index ce8c2226ca1..46be9eb885f 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -329,12 +329,14 @@ std::vector Communicator::getRemotePtrs(at::Tensor tensor) { auto it = remote_ptrs_.find(tensor); if (it == remote_ptrs_.end()) { if (deviceId() == 0) { - std::cout << "rank " << deviceId() << " registers tensor " << tensor.data_ptr() << "with hash" << std::endl; + std::cout << "rank " << deviceId() << " registers tensor " + << tensor.data_ptr() << "with hash" << std::endl; } std::vector remote_ptrs(size(), nullptr); std::string prefix = "nvfuser_ipc_tensor_info_"; IpcTensorInfo ipc_tensor_info; - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr())); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr())); ipc_tensor_info.storage_offset = tensor.storage_offset(); ipc_tensor_info.element_size = tensor.element_size(); @@ -344,15 +346,19 @@ std::vector Communicator::getRemotePtrs(at::Tensor tensor) { barrier(); - for (int64_t rank: c10::irange(size())) { + for (int64_t rank : c10::irange(size())) { if (rank == my_rank) { remote_ptrs.at(rank) = tensor.data_ptr(); } else { - ipc_tensor_info = fromBytes(store->get(prefix + std::to_string(rank))); + ipc_tensor_info = + fromBytes(store->get(prefix + std::to_string(rank))); void*& ptr = remote_ptrs.at(rank); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( + &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); // TODO: close ipc mem handle at shutdown - ptr = (void*)((uint8_t*)ptr + ipc_tensor_info.storage_offset * ipc_tensor_info.element_size); + ptr = (void*)((uint8_t*)ptr + + ipc_tensor_info.storage_offset * + ipc_tensor_info.element_size); } } it = 
remote_ptrs_.emplace(tensor, std::move(remote_ptrs)).first; diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 7124b91f006..ebe4a60ddfd 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -162,7 +162,8 @@ class Communicator { auto ptr = reinterpret_cast(tensor.data_ptr()); auto offset = tensor.storage_offset(); auto element_size = tensor.element_size(); - return std::hash()(ptr) ^ std::hash()(offset) ^ std::hash()(element_size); + return std::hash()(ptr) ^ std::hash()(offset) ^ + std::hash()(element_size); } }; @@ -204,7 +205,8 @@ class Communicator { c10::intrusive_ptr store_; // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; - std::unordered_map, TensorHash, TensorEqual> remote_ptrs_; + std::unordered_map, TensorHash, TensorEqual> + remote_ptrs_; }; } // namespace nvfuser diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index eaea12ef2f3..5cb0d691f09 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -29,8 +29,8 @@ MultiDeviceExecutor::MultiDeviceExecutor( std::unique_ptr hic = lower.lower(std::move(fusion), comm.deviceId()); // Create the HostIrEvaluator representing the host program - host_ir_executor_ = - std::make_unique(std::move(hic), &comm, params.executor); + host_ir_executor_ = std::make_unique( + std::move(hic), &comm, params.executor); } std::vector MultiDeviceExecutor::runWithInput( diff --git a/csrc/multidevice/executor.h b/csrc/multidevice/executor.h index e43b7c57f72..a3953fd0a47 100644 --- a/csrc/multidevice/executor.h +++ b/csrc/multidevice/executor.h @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu index 1d38e034137..cd8275dc92c 100644 --- a/tests/cpp/multidevice_kernels.cu +++ b/tests/cpp/multidevice_kernels.cu @@ -10,12 +10,16 @@ // 
(except raw headers). Compiling dynamic_type.h with nvcc is not supported. // Compiling pytorch with nvcc is not supported either. -#include #include +#include namespace nvfuser { -#define CUDA_CALL(call) NVF_ERROR((call) == cudaSuccess, "CUDA call failed: ", cudaGetErrorString(cudaGetLastError())) +#define CUDA_CALL(call) \ + NVF_ERROR( \ + (call) == cudaSuccess, \ + "CUDA call failed: ", \ + cudaGetErrorString(cudaGetLastError())) __global__ void DummyMultiDeviceKernel() {} diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index db44f0a5e31..37e72445484 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -37,10 +37,12 @@ TEST_F(GpuCommTest, IpcMemHandle) { auto store = communicator_->getTcpStore(); store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle)); communicator_->barrier(); - auto peer_ipc_handle = fromBytes(store->get("ipc_handle_" + std::to_string((rank + 1) % num_devices))); + auto peer_ipc_handle = fromBytes( + store->get("ipc_handle_" + std::to_string((rank + 1) % num_devices))); void* peer_d_ptr; - CUDA_CALL(cudaIpcOpenMemHandle(&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CALL(cudaIpcOpenMemHandle( + &peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); int64_t peer_value; CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size, cudaMemcpyDeviceToHost)); @@ -50,7 +52,6 @@ TEST_F(GpuCommTest, IpcMemHandle) { // Clean up CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); CUDA_CALL(cudaFree(d_ptr)); - } TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) { @@ -75,13 +76,16 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) { auto store = communicator_->getTcpStore(); store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle)); communicator_->barrier(); - auto peer_ipc_handle = fromBytes(store->get("ipc_handle_" + std::to_string(peer_rank))); + auto peer_ipc_handle = fromBytes( + 
store->get("ipc_handle_" + std::to_string(peer_rank))); int64_t* peer_d_ptr; - CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CALL(cudaIpcOpenMemHandle( + (void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); int64_t peer_value; - CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr + 1, size / 2, cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpy( + &peer_value, peer_d_ptr + 1, size / 2, cudaMemcpyDeviceToHost)); EXPECT_EQ(2 * peer_rank + 1, peer_value); @@ -91,7 +95,8 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtReceiver) { } TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { - // TLDR; We CANNOT do pointer arithmetic on the sender side! The IPC handle points to the beginning of the allocated buffer. + // TLDR; We CANNOT do pointer arithmetic on the sender side! The IPC handle + // points to the beginning of the allocated buffer. // Allocate GPU memory constexpr size_t size = 2 * sizeof(int64_t); @@ -112,15 +117,20 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { auto store = communicator_->getTcpStore(); store->set("ipc_handle_" + std::to_string(rank), toBytes(ipc_handle)); communicator_->barrier(); - auto peer_ipc_handle = fromBytes(store->get("ipc_handle_" + std::to_string(peer_rank))); + auto peer_ipc_handle = fromBytes( + store->get("ipc_handle_" + std::to_string(peer_rank))); int64_t* peer_d_ptr; - CUDA_CALL(cudaIpcOpenMemHandle((void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + CUDA_CALL(cudaIpcOpenMemHandle( + (void**)&peer_d_ptr, peer_ipc_handle, cudaIpcMemLazyEnablePeerAccess)); int64_t peer_value; - CUDA_CALL(cudaMemcpy(&peer_value, peer_d_ptr, size / 2, cudaMemcpyDeviceToHost)); + CUDA_CALL( + cudaMemcpy(&peer_value, peer_d_ptr, size / 2, cudaMemcpyDeviceToHost)); - EXPECT_EQ(2 * peer_rank, peer_value); // and not 2 * peer_rank + 1 as could be expected! 
+ EXPECT_EQ( + 2 * peer_rank, + peer_value); // and not 2 * peer_rank + 1 as could be expected! // Clean up CUDA_CALL(cudaIpcCloseMemHandle(peer_d_ptr)); @@ -130,7 +140,8 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { TEST_F(GpuCommTest, Allgather) { constexpr int64_t kTensorSize = 1024; - at::Tensor input = at::full({kTensorSize}, communicator_->deviceId(), tensor_options); + at::Tensor input = + at::full({kTensorSize}, communicator_->deviceId(), tensor_options); auto outputs = std::vector(communicator_->size()); std::generate(outputs.begin(), outputs.end(), [&]() { return at::empty({kTensorSize}, tensor_options); diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 6344fcae890..ee916344001 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -9,6 +9,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -16,9 +19,6 @@ #include #include #include -#include -#include -#include #define CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS 0 @@ -55,7 +55,9 @@ using DummyOverlapBenchmarkParams = std::tuple< /*pre_comm=*/bool, /*post_comm=*/bool>; -class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { +class DummyOverlapBenchmark + : public MultiDeviceTest, + public testing::WithParamInterface { protected: static std::map times; @@ -64,8 +66,9 @@ class DummyOverlapBenchmark : public MultiDeviceTest, public testing::WithParamI if (rank != 0) { return; } - for (auto it: times) { - std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl; + for (auto it : times) { + std::cout << "time " << rank << ": " << it.first << ": " << it.second + << std::endl; } } }; @@ -75,38 +78,38 @@ std::map DummyOverlapBenchmark::times = {}; TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { constexpr int64_t number_of_warmups = 20; constexpr int64_t number_of_iterations = 80; - constexpr int64_t 
total_number_of_iterations = number_of_warmups + number_of_iterations; + constexpr int64_t total_number_of_iterations = + number_of_warmups + number_of_iterations; constexpr int64_t iteration_profiler_start = 5; constexpr int64_t iteration_profiler_end = 10; - - auto [backend, - M, - K, - N, - L, - pre_comm, - post_comm] = GetParam(); + auto [backend, M, K, N, L, pre_comm, post_comm] = GetParam(); std::vector all_ranks(communicator_->size()); std::iota(all_ranks.begin(), all_ranks.end(), 0); auto world = communicator_->getBackendForTeam(all_ranks, backend); - auto nccl_world = communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); + auto nccl_world = + communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); std::vector streams = createStreams(2, communicator_->deviceId()); auto& compute_stream = streams.at(0); auto& communication_stream = streams.at(1); - auto options_matmul = at::TensorOptions().dtype(torch::kFloat16).device(communicator_->device()); + auto options_matmul = at::TensorOptions() + .dtype(torch::kFloat16) + .device(communicator_->device()); auto ta = at::randn({M, K}, options_matmul); auto tb = at::randn({K, N}, options_matmul); auto tc = at::empty({M, N}, options_matmul); - auto options_comms = at::TensorOptions().dtype(torch::kFloat32).device(communicator_->device()); + auto options_comms = at::TensorOptions() + .dtype(torch::kFloat32) + .device(communicator_->device()); auto src = at::randn({L}, options_comms); - auto dst = at::empty({L * communicator_->size()}, options_comms); - std::vector barrier_scratch_buffer = {at::randn({1}, options_comms)}; + auto dst = at::empty({L * communicator_->size()}, options_comms); + std::vector barrier_scratch_buffer = { + at::randn({1}, options_comms)}; cudaEvent_t start, stop; cudaEventCreate(&start); @@ -114,10 +117,10 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { nccl_world->allreduce(barrier_scratch_buffer)->wait(); - for (const auto& iteration : - 
c10::irange(total_number_of_iterations)) { + for (const auto& iteration : c10::irange(total_number_of_iterations)) { if (iteration % 10 == 0 && communicator_->deviceId() == 0) { - std::cout << "iteration " << iteration <<"/" << total_number_of_iterations << std::endl; + std::cout << "iteration " << iteration << "/" + << total_number_of_iterations << std::endl; } if (iteration == iteration_profiler_start) { cudaProfilerStart(); @@ -141,7 +144,8 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { } if (iteration == iteration_profiler_end) { - cudaProfilerStop();; + cudaProfilerStop(); + ; } if (!pre_comm & !post_comm) { nccl_world->allreduce(barrier_scratch_buffer)->wait(); @@ -157,32 +161,43 @@ TEST_P(DummyOverlapBenchmark, PipelinedAGMatmulBenchmark) { cudaEventElapsedTime(&milliseconds, start, stop); milliseconds /= number_of_iterations; - std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string test_name = + ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); - std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + std::cout << "rank " << communicator_->deviceId() << ", " << test_name + << " : " << milliseconds << std::endl; } INSTANTIATE_TEST_SUITE_P( , DummyOverlapBenchmark, testing::Combine( - testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), - /*M=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*K=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*N=*/testing::Values(pow(2,10), pow(2,15), pow(2,17)), - /*L=*/testing::Values(1, pow(2,10), pow(2,15), pow(2,17), pow(2,20), pow(2,24), pow(2,26), pow(2,28)), - /*pre-comm=*/testing::Bool(), - /*post-comm=*/testing::Bool()), + testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), + /*M=*/testing::Values(pow(2, 10), pow(2, 15), pow(2, 17)), + /*K=*/testing::Values(pow(2, 10), pow(2, 15), pow(2, 17)), 
+ /*N=*/testing::Values(pow(2, 10), pow(2, 15), pow(2, 17)), + /*L=*/ + testing::Values( + 1, + pow(2, 10), + pow(2, 15), + pow(2, 17), + pow(2, 20), + pow(2, 24), + pow(2, 26), + pow(2, 28)), + /*pre-comm=*/testing::Bool(), + /*post-comm=*/testing::Bool()), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; - os << /*backend*/std::get<0>(info.param) << "_" + os << /*backend*/ std::get<0>(info.param) << "_" << "M" << std::get<1>(info.param) << "_" << "K" << std::get<2>(info.param) << "_" << "N" << std::get<3>(info.param) << "_" << "L" << std::get<4>(info.param) - << ((std::get<5>(info.param))? "_pre_comm" : "") - << ((std::get<6>(info.param))? "_post_comm" : ""); + << ((std::get<5>(info.param)) ? "_pre_comm" : "") + << ((std::get<6>(info.param)) ? "_post_comm" : ""); return os.str(); }); @@ -199,7 +214,9 @@ using OverlapBenchmarkParams = std::tuple< /*use_cuda_graph=*/bool, DataType>; -class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterface { +class OverlapBenchmark + : public MultiDeviceTest, + public testing::WithParamInterface { protected: static std::map times; @@ -208,8 +225,9 @@ class OverlapBenchmark : public MultiDeviceTest, public testing::WithParamInterf if (rank != 0) { return; } - for (auto it: times) { - std::cout << "time " << rank << ": " << it.first << ": " << it.second << std::endl; + for (auto it : times) { + std::cout << "time " << rank << ": " << it.first << ": " << it.second + << std::endl; } } }; @@ -223,19 +241,19 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { constexpr int64_t iteration_profiler_end = 15; constexpr int64_t iteration_cuda_graph_capture = 5; - const int64_t D = communicator_->size(); - auto [backend, - S, - M, - K, - N, - number_of_streams, - add_cuStreamWriteValue32, - number_of_pgs, - unfuse_loops, - use_cuda_graph, - dtype] = GetParam(); + auto + [backend, + S, + M, + K, + N, + number_of_streams, + add_cuStreamWriteValue32, + number_of_pgs, + unfuse_loops, + 
use_cuda_graph, + dtype] = GetParam(); if (backend == CommunicatorBackend::kCuda) { GTEST_SKIP() << "Cuda Backend not supported in this test"; } @@ -248,11 +266,13 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { createStreams(number_of_streams, communicator_->deviceId()); setCurrentCUDAStream(streams.at(0)); - auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device()); - auto ta = at::randn({S, M/S,K}, options); - auto ta_unsharded = at::empty({S, D, M/S,K}, options); - auto tb = at::randn({K,N}, options); - auto tc = at::empty({S, D, M/S, N}, options); + auto options = at::TensorOptions() + .dtype(data_type_to_aten(dtype)) + .device(communicator_->device()); + auto ta = at::randn({S, M / S, K}, options); + auto ta_unsharded = at::empty({S, D, M / S, K}, options); + auto tb = at::randn({K, N}, options); + auto tc = at::empty({S, D, M / S, N}, options); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -270,7 +290,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { for (const auto& iteration : c10::irange(number_of_warmups + number_of_iterations)) { if (iteration == iteration_profiler_start) { - cudaProfilerStart();; + cudaProfilerStart(); + ; } if (iteration == number_of_warmups) { cudaEventRecord(start); @@ -283,7 +304,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { int64_t stream_index = j % streams.size(); setCurrentCUDAStream(streams.at(stream_index)); - auto world = communicator_->getBackendForTeam(all_ranks, backend, std::to_string(j % number_of_pgs)); + auto world = communicator_->getBackendForTeam( + all_ranks, backend, std::to_string(j % number_of_pgs)); auto ta_j = ta.select(0, j); auto ta_unsharded_j = ta_unsharded.select(0, j); @@ -295,15 +317,17 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { if (add_cuStreamWriteValue32) { cuStreamWriteValue32( #if CUSTOM_PG_WITH_INTERNAL_STREAM_ACCESS - (CUstream)world->getCudaStream(communicator_->device()).stream(), + 
(CUstream)world->getCudaStream(communicator_->device()).stream(), #else - (CUstream)streams.at(stream_index).stream(), + (CUstream)streams.at(stream_index).stream(), #endif - (CUdeviceptr)pDevice, (cuuint32_t)(iteration * S + j), (unsigned int)0); + (CUdeviceptr)pDevice, + (cuuint32_t)(iteration * S + j), + (unsigned int)0); } if (unfuse_loops == false) { // compute - torch::matmul_out(tc_j, ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j, tb); } } if (unfuse_loops) { @@ -314,7 +338,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { auto tc_j = tc.select(0, j); // compute - torch::matmul_out(tc_j, ta_unsharded_j,tb); + torch::matmul_out(tc_j, ta_unsharded_j, tb); } } if (use_cuda_graph && (iteration == iteration_cuda_graph_capture)) { @@ -327,7 +351,8 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { cuda_graph.replay(); } if (iteration == iteration_profiler_end) { - cudaProfilerStop();; + cudaProfilerStop(); + ; } } cudaEventRecord(stop); @@ -336,14 +361,21 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmark) { cudaEventElapsedTime(&milliseconds, start, stop); milliseconds /= number_of_iterations; - std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string test_name = + ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); - std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + std::cout << "rank " << communicator_->deviceId() << ", " << test_name + << " : " << milliseconds << std::endl; if (add_cuStreamWriteValue32) { - std::cout << "RANK " << communicator_->device() << " entering while loop. 
Max index=" << (number_of_warmups + number_of_iterations)*S + S << std::endl; - while (*ptr < (cuuint32_t)(number_of_warmups + number_of_iterations)*S + S - 1) { - std::cout << "RANK " << communicator_->device() << " waiting at index=" << *ptr << std::endl; + std::cout << "RANK " << communicator_->device() + << " entering while loop. Max index=" + << (number_of_warmups + number_of_iterations) * S + S + << std::endl; + while (*ptr < + (cuuint32_t)(number_of_warmups + number_of_iterations) * S + S - 1) { + std::cout << "RANK " << communicator_->device() + << " waiting at index=" << *ptr << std::endl; } cudaFree((void*)ptr); } @@ -356,27 +388,30 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { constexpr int64_t iteration_profiler_end = 15; const int64_t D = communicator_->size(); - auto [backend, - S, - M, - K, - N, - number_of_streams, - add_cuStreamWriteValue32, - number_of_pgs, - unfuse_loops, - use_cuda_graph, - dtype] = GetParam(); + auto + [backend, + S, + M, + K, + N, + number_of_streams, + add_cuStreamWriteValue32, + number_of_pgs, + unfuse_loops, + use_cuda_graph, + dtype] = GetParam(); if (M % (D * S) != 0) { GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M << ", D = " << D << ", S = " << S; } if (add_cuStreamWriteValue32) { - GTEST_SKIP() << "cuStreamWriteValue32 not supported with StreamParallelType"; + GTEST_SKIP() + << "cuStreamWriteValue32 not supported with StreamParallelType"; } if (number_of_pgs > 1) { - GTEST_SKIP() << "StreamParallelType not supported with multiple process groups"; + GTEST_SKIP() + << "StreamParallelType not supported with multiple process groups"; } if (unfuse_loops) { GTEST_SKIP() << "StreamParallelType not supported with unfused loops"; @@ -385,7 +420,6 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { GTEST_SKIP() << "StreamParallelType not supported with cuda graphs"; } - auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -410,9 +444,9 @@ 
TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { params.executor.number_of_streams = number_of_streams; MultiDeviceExecutor executor(std::move(fusion), *communicator_, params); - - auto tensor_options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(communicator_->device()); + auto tensor_options = at::TensorOptions() + .dtype(data_type_to_aten(dtype)) + .device(communicator_->device()); at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options); at::Tensor ta = ta_unsharded.slice( 1, communicator_->deviceId(), communicator_->deviceId() + 1); @@ -426,8 +460,7 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { cudaEventCreate(&start); cudaEventCreate(&stop); - for (const auto& iteration : - c10::irange(number_of_iterations)) { + for (const auto& iteration : c10::irange(number_of_iterations)) { if (iteration == iteration_profiler_start) { cudaProfilerStart(); } @@ -447,46 +480,55 @@ TEST_P(OverlapBenchmark, PipelinedAGMatmulBenchmarkStreamParallelType) { cudaEventElapsedTime(&milliseconds, start, stop); milliseconds /= number_of_iterations; - std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string test_name = + ::testing::UnitTest::GetInstance()->current_test_info()->name(); times.insert({test_name, milliseconds}); - std::cout << "rank " << communicator_->deviceId() << ", " << test_name << " : " << milliseconds << std::endl; + std::cout << "rank " << communicator_->deviceId() << ", " << test_name + << " : " << milliseconds << std::endl; - EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref << ",\ntc: " << tc; + EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-1, 1e-1)) + << "rank " << communicator_->deviceId() << "failed.\ntc_ref: " << tc_ref + << ",\ntc: " << tc; } INSTANTIATE_TEST_SUITE_P( , OverlapBenchmark, testing::Combine( - testing::Values(CommunicatorBackend::kNccl, 
CommunicatorBackend::kUcc, CommunicatorBackend::kCuda), - /*S=*/testing::Values(1,2,4,8, 16, 32), - /*M=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)), - /*K=*/testing::Values(pow(2,3), pow(2,10), pow(2,15), pow(2,18)), - /*N=*/testing::Values(pow(2,3), pow(2,10), pow(2,15)), - /*number_of_streams=*/testing::Values(3, 8, 32), - /*add_cuStreamWriteValue32*/testing::Values(false, true), - /*number_of_pgs=*/testing::Values(1, 2, 4, 8), - /*unfuse_loops=*/testing::Values(false, true), - /*use_cuda_graph=*/testing::Values(false), // cuda graphs not supported: ucc does not supports it (segfault) and nccl PG has a "syncStream" that throws - testing::Values(DataType::Float, DataType::Half, DataType::BFloat16)), + testing::Values( + CommunicatorBackend::kNccl, + CommunicatorBackend::kUcc, + CommunicatorBackend::kCuda), + /*S=*/testing::Values(1, 2, 4, 8, 16, 32), + /*M=*/testing::Values(pow(2, 3), pow(2, 10), pow(2, 15), pow(2, 18)), + /*K=*/testing::Values(pow(2, 3), pow(2, 10), pow(2, 15), pow(2, 18)), + /*N=*/testing::Values(pow(2, 3), pow(2, 10), pow(2, 15)), + /*number_of_streams=*/testing::Values(3, 8, 32), + /*add_cuStreamWriteValue32*/ testing::Values(false, true), + /*number_of_pgs=*/testing::Values(1, 2, 4, 8), + /*unfuse_loops=*/testing::Values(false, true), + /*use_cuda_graph=*/testing::Values(false), // cuda graphs not supported: + // ucc does not supports it + // (segfault) and nccl PG has + // a "syncStream" that throws + testing::Values(DataType::Float, DataType::Half, DataType::BFloat16)), [](const testing::TestParamInfo& info) -> std::string { std::ostringstream os; - os << /*backend*/std::get<0>(info.param) << "_" + os << /*backend*/ std::get<0>(info.param) << "_" << "S" << std::get<1>(info.param) << "_" << "M" << std::get<2>(info.param) << "_" << "K" << std::get<3>(info.param) << "_" << "N" << std::get<4>(info.param) << "_" << "Streams" << std::get<5>(info.param) << "_" - << /*dtype:*/std::get<10>(info.param) << "_" - << 
((std::get<6>(info.param))? "WithcuStreamWriteValue32_" : "") + << /*dtype:*/ std::get<10>(info.param) << "_" + << ((std::get<6>(info.param)) ? "WithcuStreamWriteValue32_" : "") << "Pgs" << std::get<7>(info.param) - << ((std::get<8>(info.param))? "_unfused" : "") - << ((std::get<9>(info.param))? "_WithCudaGraph" : ""); + << ((std::get<8>(info.param)) ? "_unfused" : "") + << ((std::get<9>(info.param)) ? "_WithCudaGraph" : ""); return os.str(); }); - struct OverlapTestParams { // Tensors sizes int64_t M = std::pow(2, 6); diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index ca113123e16..bee3bb7c56c 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -127,9 +127,7 @@ void PipelineTest::executeAndValidate(bool validate_with_prescribed_values) { MultiDeviceExecutorParams params; params.executor = host_ir_executor_params; runtime = std::make_unique( - std::make_unique(*fusion), - *communicator_, - params); + std::make_unique(*fusion), *communicator_, params); auto error_msg = runtime->validate(); if (error_msg != "") { GTEST_SKIP() << error_msg; From 7fca0355ab99c452172c3b2e63f974a88391cd83 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 5 Feb 2025 14:26:49 -0800 Subject: [PATCH 38/55] wip. The send and recv Expr* need to be matched together for associating the buffer. 
Need to either use (non-P2P)Communication* or better have a Host Node to explicitely share the handles as something explicit in the Host Ir program --- CMakeLists.txt | 1 + csrc/driver_api.h | 1 - csrc/host_ir/executor.cpp | 181 ++++++++++++++---- csrc/multidevice/communicator.cpp | 86 +++++---- csrc/multidevice/communicator.h | 46 ++++- tests/cpp/multidevice_kernels.cu | 3 +- tests/cpp/test_multidevice_communications.cpp | 55 ++++++ tests/cpp/test_multidevice_gpu_comms.cpp | 18 ++ tests/cpp/test_multidevice_overlap.cpp | 3 +- 9 files changed, 311 insertions(+), 83 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 364e39596b9..5e59cfddc65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -346,6 +346,7 @@ target_link_libraries(codegen_internal PUBLIC ${LIBCUPTI} ${TORCH_LIBRARIES} dl + cuda ) add_library(nvfuser_codegen SHARED $) diff --git a/csrc/driver_api.h b/csrc/driver_api.h index 8105cf855c2..b8c413a4054 100644 --- a/csrc/driver_api.h +++ b/csrc/driver_api.h @@ -32,7 +32,6 @@ namespace nvfuser { fn(cuModuleGetFunction); \ fn(cuModuleLoadDataEx); \ fn(cuModuleUnload); \ - fn(cuStreamWriteValue32); \ fn(cuOccupancyMaxActiveBlocksPerMultiprocessor) #if (CUDA_VERSION >= 12000) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index cc30ee58316..070cd299ee3 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nvfuser { @@ -435,40 +436,40 @@ void HostIrEvaluator::handle(Communication* communication) { NVF_ERROR(communication->type() == CommunicationType::Allgather); - std::vector output_tensors = - at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); - std::vector input_ptrs = communicator_->getRemotePtrs(input_tensor); - cudaStream_t current_stream = - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream(); - // TODO: use multicast - for (auto i = 0; i < communicator_->size(); i++) { - cudaStream_t stream = 
c10::cuda::getStreamFromPool( - /*isHighPriority=*/false, my_local_device_index_) - .stream(); - cudaEvent_t event = {}; - NVFUSER_CUDA_RT_SAFE_CALL( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream)); - NVFUSER_CUDA_RT_SAFE_CALL( - cudaStreamWaitEvent(stream, event, cudaEventWaitDefault)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); - - auto output = output_tensors.at(i); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - output.data_ptr(), - input_ptrs.at(i), - output.numel() * output.element_size(), - cudaMemcpyDeviceToDevice, - stream)); - - // sync - NVFUSER_CUDA_RT_SAFE_CALL( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream)); - NVFUSER_CUDA_RT_SAFE_CALL( - cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); - } + // std::vector output_tensors = + // at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); + // const std::vector& input_ptrs = communicator_->getRemotePtrs(input_tensor); + // cudaStream_t current_stream = + // c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream(); + // // TODO: use multicast + // for (auto i = 0; i < communicator_->size(); i++) { + // cudaStream_t stream = c10::cuda::getStreamFromPool( + // /*isHighPriority=*/false, my_local_device_index_) + // .stream(); + // cudaEvent_t event = {}; + // NVFUSER_CUDA_RT_SAFE_CALL( + // cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream)); + // NVFUSER_CUDA_RT_SAFE_CALL( + // cudaStreamWaitEvent(stream, event, cudaEventWaitDefault)); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); + + // auto output = output_tensors.at(i); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + // output.data_ptr(), + // input_ptrs.at(i), + // output.numel() * output.element_size(), + 
// cudaMemcpyDeviceToDevice, + // stream)); + + // // sync + // NVFUSER_CUDA_RT_SAFE_CALL( + // cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream)); + // NVFUSER_CUDA_RT_SAFE_CALL( + // cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault)); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); + // } } void HostIrEvaluator::handle(P2PCommunication* communication) { @@ -490,20 +491,118 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { buffer); return; } - NVF_ERROR(false, "CUDA backend not supported yet"); + + + + // FIST TIME: + // sender exports cudaIpc mem handle on input buffer and put it to store + // sender signals recv it can open the mem handle. It needs to be CPU blocking + // recv opens the handle and gets the pointer. + // It copies the data and then signal sender on completion + + // SECOND TIME: + // Sender signals recv it can copy + // Recv copies the data and signals sender on completion + +// each rank must have a bool "recvied" and a "sent" bool per rank. So n+1 +// each rank must have, per rank, a sent_to and a received_from a bool "recvied" and a "sent" bool per rank. 
So n+1 + + + + // std::string prefix = "nvfuser_ipc_tensor_info_" + communication->buffer()->name() + "_"; + // IpcTensorInfo ipc_tensor_info; + // NVFUSER_CUDA_RT_SAFE_CALL( + // cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, buffer.data_ptr())); + // ipc_tensor_info.storage_offset = buffer.storage_offset(); + // ipc_tensor_info.element_size = buffer.element_size(); + + // auto store = communicator_->getTcpStore(); + // store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info)); + + // Team team = {my_rank, peer}; + // communicator_->getBackendForTeam(team, CommunicatorBackend::kNccl)->barrier()->wait(); + + // for (int64_t rank : c10::irange(size())) { + // if (rank == my_rank) { + // remote_ptrs.at(rank) = tensor.data_ptr(); + // } else { + // ipc_tensor_info = + // fromBytes(store->get(prefix + std::to_string(rank))); + // void*& ptr = remote_ptrs.at(rank); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( + // &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // // TODO: close ipc mem handle at shutdown + // ptr = (void*)((uint8_t*)ptr + + // ipc_tensor_info.storage_offset * + // ipc_tensor_info.element_size); + // } + // } + + const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + const std::vector& remote_buffers = communicator_->getRemoteBuffer(buffer, std::to_string(communication->buffer()->name())); + const int64_t my_rank = communicator_->deviceId(); + const int64_t peer = expr_evaluator_.evaluate(communication->peer()).as(); + const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); + const RemoteBufferInfo& peer_buffer = remote_buffers.at(peer); + const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); + const auto remote_semaphore = reinterpret_cast(&peer_buffer.semaphores()[my_rank]); + static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); + + + if (communication->type() == 
P2PCommunicationType::RECV) { + std::cout << "RANK " << my_rank << " RECV, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; + // signal to self that transfer is in progress + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); + // signal sender that receiver is ready + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error + } else /*sender*/ { + std::cout << "RANK " << my_rank << " SEND, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; + // wait for sender to be ready + // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); + std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << std::endl; + // RDMA writes data from sender to receiver + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + remote_buffers.at(my_rank).ptr(), + my_buffer.ptr(), + buffer.numel() * buffer.element_size(), + cudaMemcpyDeviceToDevice, + current_stream)); + std::cout << "RANK " << my_rank << " SEND after memcpy" << std::endl; + // Signals completion to self + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); + // Signals completion to receiver + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); + } } void HostIrEvaluator::handle(Wait* wait) { Expr* communication = wait->communication(); - auto it = works_.find(communication); - if (it == works_.end()) { + auto* p2p_comm = dynamic_cast(communication); + if (p2p_comm && p2p_comm->backend() 
!= CommunicatorBackend::kCuda) { + auto it = works_.find(communication); + if (it == works_.end()) { + return; + } + auto& work = it->second; + if (work != nullptr) { + work->wait(); + } + works_.erase(communication); return; } - auto& work = it->second; - if (work != nullptr) { - work->wait(); + + if (p2p_comm->type() == P2PCommunicationType::RECV) { + // const auto current_stream = static_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + const std::vector& remote_buffers = communicator_->getRemoteBuffer(getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_), std::to_string(p2p_comm->buffer()->name())); + const int64_t my_rank = communicator_->deviceId(); + const int64_t peer = expr_evaluator_.evaluate(p2p_comm->peer()).as(); + const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); + const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); + + std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; + // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ)); + std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; } - works_.erase(communication); } namespace { diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 46be9eb885f..902ceaaa64d 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -319,49 +319,63 @@ void Communicator::barrier(std::optional backend) { getWorld(backend)->barrier(options)->wait(); } -struct IpcTensorInfo { - cudaIpcMemHandle_t ipc_handle; - int64_t storage_offset; - int64_t element_size; -}; - -std::vector Communicator::getRemotePtrs(at::Tensor tensor) { - auto it = remote_ptrs_.find(tensor); - if (it == remote_ptrs_.end()) { - if (deviceId() == 0) { - std::cout << "rank " << 
deviceId() << " registers tensor " - << tensor.data_ptr() << "with hash" << std::endl; - } - std::vector remote_ptrs(size(), nullptr); - std::string prefix = "nvfuser_ipc_tensor_info_"; - IpcTensorInfo ipc_tensor_info; - NVFUSER_CUDA_RT_SAFE_CALL( - cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, tensor.data_ptr())); - ipc_tensor_info.storage_offset = tensor.storage_offset(); - ipc_tensor_info.element_size = tensor.element_size(); - - const int64_t my_rank = deviceId(); +RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor, int64_t size) : ptr_(tensor.data_ptr()), size_(size), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) { + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, size_ * sizeof(IpcSemaphore))); + static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int"); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, size_ * sizeof(IpcSemaphore))); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); +} + +RemoteBufferInfo::RemoteBufferInfo(std::vector data) : is_imported_(true) { + RemoteBufferInfo imported_buffer = fromBytes(data); + + size_ = imported_buffer.size_; + storage_offset_ = imported_buffer.storage_offset_; + element_size_ = imported_buffer.element_size_; + ipc_handle_ = imported_buffer.ipc_handle_; + semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; + + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); + ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); + + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); +} + +RemoteBufferInfo::~RemoteBufferInfo() { + // if (is_imported_) { + // NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&ipc_handle_)); + // 
NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&semaphores_ipc_handle_)); + // } else { + // NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(semaphores_)); + // } +} + + +std::vector Communicator::getRemoteBuffer(at::Tensor tensor, std::string key) { + auto it = remote_buffers_.find(tensor); + if (it == remote_buffers_.end()) { + RemoteBufferInfo buffer_handle(tensor, size_); + auto store = getTcpStore(); - store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info)); + std::string prefix = "nvfuser_remote_buffer_info_" + key; + std::cout << "RANK " << deviceId() << "registers at key " << prefix + std::to_string(deviceId()) << std::endl; + store->set(prefix + std::to_string(deviceId()), toBytes(buffer_handle)); barrier(); - for (int64_t rank : c10::irange(size())) { - if (rank == my_rank) { - remote_ptrs.at(rank) = tensor.data_ptr(); + std::cout << "RANK " << deviceId() << "after barrier for key " << prefix + std::to_string(deviceId()) << std::endl; + std::vector remote_buffers; + remote_buffers.reserve(size_); + for (int64_t rank : c10::irange(size_)) { + if (rank == deviceId()) { + remote_buffers.push_back(std::move(buffer_handle)); } else { - ipc_tensor_info = - fromBytes(store->get(prefix + std::to_string(rank))); - void*& ptr = remote_ptrs.at(rank); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( - &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); - // TODO: close ipc mem handle at shutdown - ptr = (void*)((uint8_t*)ptr + - ipc_tensor_info.storage_offset * - ipc_tensor_info.element_size); + RemoteBufferInfo imported_remote_buffer_info(store->get(prefix + std::to_string(rank))); + remote_buffers.push_back(std::move(imported_remote_buffer_info)); } } - it = remote_ptrs_.emplace(tensor, std::move(remote_ptrs)).first; + it = remote_buffers_.emplace(tensor, std::move(remote_buffers)).first; } return it->second; } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index ebe4a60ddfd..39c9d667bf4 100644 --- 
a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -10,6 +10,11 @@ #include #include #include +// #include +#include +#include +#include + #include #include @@ -36,6 +41,41 @@ T fromBytes(std::vector bytes) { return *reinterpret_cast(bytes.data()); } +enum class IpcSemaphore : cuuint32_t { + kReady, + kTransferInProgress +}; + +class RemoteBufferInfo { + public: + + RemoteBufferInfo(at::Tensor tensor, int64_t size); + RemoteBufferInfo(std::vector data); // means it is imported + ~RemoteBufferInfo(); + + void* ptr() const { + return ptr_; + } + + auto semaphores() const { + return semaphores_; + } + + auto size() const { + return size_; + } + + private: + void* ptr_; + int64_t size_; + int64_t storage_offset_; + int64_t element_size_; + bool is_imported_; + cudaIpcMemHandle_t ipc_handle_; + cudaIpcMemHandle_t semaphores_ipc_handle_; + IpcSemaphore* semaphores_; +}; + // This file implements the class Communicator which sets up the inter-process // Backend. This class contains inter-process information, such as the rank, the // world size, as well as the Process Group that can be called to perform @@ -154,7 +194,7 @@ class Communicator { return store_; } - std::vector getRemotePtrs(at::Tensor tensor); + std::vector getRemoteBuffer(at::Tensor tensor, std::string key); private: struct TensorHash { @@ -205,8 +245,8 @@ class Communicator { c10::intrusive_ptr store_; // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; - std::unordered_map, TensorHash, TensorEqual> - remote_ptrs_; + std::unordered_map, TensorHash, TensorEqual> + remote_buffers_; }; } // namespace nvfuser diff --git a/tests/cpp/multidevice_kernels.cu b/tests/cpp/multidevice_kernels.cu index cd8275dc92c..9634130cb3d 100644 --- a/tests/cpp/multidevice_kernels.cu +++ b/tests/cpp/multidevice_kernels.cu @@ -10,7 +10,8 @@ // (except raw headers). Compiling dynamic_type.h with nvcc is not supported. 
// Compiling pytorch with nvcc is not supported either. -#include +// #include +#include #include namespace nvfuser { diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index 82a45c4f87f..8a50d152a4d 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -413,4 +413,59 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), testing::PrintToStringParamName()); +using P2PCommunicationTest = MultiDeviceTest; + +TEST_F(P2PCommunicationTest, CudaComm) { + static constexpr int kTensorSize = 8; + static constexpr int kNumRepetitions = 8; + + if (communicator_->size() < 2 || torch::cuda::device_count() < 2) { + GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks."; + } + + const DeviceIdxType my_rank = communicator_->deviceId(); + const DeviceIdxType size = communicator_->size(); + const DeviceIdxType send_peer = (my_rank + 1) % size; + const DeviceIdxType recv_peer = (size + my_rank - 1) % size; + + auto container = std::make_unique(); + FusionGuard fg(container.get()); + auto* send_tv = makeContigTensor(1); + auto* recv_tv = ops::newValLike(send_tv, send_tv->dtype())->as(); + container->addInput(send_tv); + container->addInput(recv_tv); + + auto* val_recv_peer = IrBuilder::create(recv_peer, DataType::Int); + auto* val_send_peer = IrBuilder::create(send_peer, DataType::Int); + + auto recv = IrBuilder::create(P2PCommunicationType::RECV, recv_tv, val_recv_peer, CommunicatorBackend::kCuda); + auto send = IrBuilder::create(P2PCommunicationType::SEND, send_tv, val_send_peer, CommunicatorBackend::kCuda); + auto wait_recv = IrBuilder::create(recv); + auto wait_send = IrBuilder::create(send); + + container->pushBackTopLevelExprs(recv); + container->pushBackTopLevelExprs(send); + container->pushBackTopLevelExprs(wait_recv); + container->pushBackTopLevelExprs(wait_send); + + hir::HostIrEvaluator 
executor(std::move(container), communicator_); + + at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options); + at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options); + + std::unordered_map inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}}; + + for (auto repetition : c10::irange(kNumRepetitions)) { + send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank); + std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl; + + executor.runWithInput(inputs); + + torch::cuda::synchronize(); + std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl; + auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer; + EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref; + } +} + } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 37e72445484..413df0f06a4 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace nvfuser { @@ -137,6 +138,23 @@ TEST_F(GpuCommTest, IpcMemHandlePtrArithmeticAtSender) { CUDA_CALL(cudaFree(d_ptr)); } +class StreamOpTest : public NVFuserTest {}; + +TEST_F(StreamOpTest, StreamWriteValue32) { + cudaStream_t stream; + void* buf; + int value = 0; + constexpr int new_value = 42; + NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int))); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream)); + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(stream, (CUdeviceptr)buf, new_value, 
CU_STREAM_WRITE_VALUE_DEFAULT)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(&value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream)); + EXPECT_EQ(value, new_value); +} + TEST_F(GpuCommTest, Allgather) { constexpr int64_t kTensorSize = 1024; diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index ee916344001..47bb5d915db 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -9,7 +9,8 @@ #include #include #include -#include +// #include +#include #include #include #include From 371554e0bd21948933c72e9108d496b0ff26ff12 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 5 Feb 2025 15:06:14 -0800 Subject: [PATCH 39/55] working chkpt well prepared for two ranks --- csrc/host_ir/executor.cpp | 4 +- tests/cpp/test_multidevice_communications.cpp | 62 +++++++++++-------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 070cd299ee3..5a45c9b23e0 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -539,7 +539,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { // } const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); - const std::vector& remote_buffers = communicator_->getRemoteBuffer(buffer, std::to_string(communication->buffer()->name())); + const std::vector& remote_buffers = communicator_->getRemoteBuffer(buffer, ""); const int64_t my_rank = communicator_->deviceId(); const int64_t peer = expr_evaluator_.evaluate(communication->peer()).as(); const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); @@ -562,7 +562,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << std::endl; // RDMA writes data from sender to receiver NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - 
remote_buffers.at(my_rank).ptr(), + peer_buffer.ptr(), my_buffer.ptr(), buffer.numel() * buffer.element_size(), cudaMemcpyDeviceToDevice, diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index 8a50d152a4d..ad13c747a5b 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -423,48 +423,58 @@ TEST_F(P2PCommunicationTest, CudaComm) { GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks."; } + if (communicator_->size() != 2) { + GTEST_SKIP() << "This test needs for now exactly 2 GPUs and 2 ranks."; + } + + + const DeviceIdxType my_rank = communicator_->deviceId(); const DeviceIdxType size = communicator_->size(); - const DeviceIdxType send_peer = (my_rank + 1) % size; - const DeviceIdxType recv_peer = (size + my_rank - 1) % size; auto container = std::make_unique(); FusionGuard fg(container.get()); - auto* send_tv = makeContigTensor(1); - auto* recv_tv = ops::newValLike(send_tv, send_tv->dtype())->as(); - container->addInput(send_tv); - container->addInput(recv_tv); - - auto* val_recv_peer = IrBuilder::create(recv_peer, DataType::Int); - auto* val_send_peer = IrBuilder::create(send_peer, DataType::Int); - - auto recv = IrBuilder::create(P2PCommunicationType::RECV, recv_tv, val_recv_peer, CommunicatorBackend::kCuda); - auto send = IrBuilder::create(P2PCommunicationType::SEND, send_tv, val_send_peer, CommunicatorBackend::kCuda); - auto wait_recv = IrBuilder::create(recv); - auto wait_send = IrBuilder::create(send); - - container->pushBackTopLevelExprs(recv); - container->pushBackTopLevelExprs(send); - container->pushBackTopLevelExprs(wait_recv); - container->pushBackTopLevelExprs(wait_send); + auto* tv = makeContigTensor(1); + container->addInput(tv); + if (my_rank == 0) { + const DeviceIdxType send_peer = (my_rank + 1) % size; + + auto* val_send_peer = IrBuilder::create(send_peer, DataType::Int); + auto send = 
IrBuilder::create(P2PCommunicationType::SEND, tv, val_send_peer, CommunicatorBackend::kCuda); + auto wait_send = IrBuilder::create(send); + container->pushBackTopLevelExprs(send); + container->pushBackTopLevelExprs(wait_send); + } else { + ASSERT_EQ(my_rank, 1); + const DeviceIdxType recv_peer = (size + my_rank - 1) % size; + auto* val_recv_peer = IrBuilder::create(recv_peer, DataType::Int); + auto recv = IrBuilder::create(P2PCommunicationType::RECV, tv, val_recv_peer, CommunicatorBackend::kCuda); + auto wait_recv = IrBuilder::create(recv); + container->pushBackTopLevelExprs(recv); + container->pushBackTopLevelExprs(wait_recv); + } hir::HostIrEvaluator executor(std::move(container), communicator_); - at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options); - at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options); + at::Tensor tensor = at::empty({kTensorSize}, tensor_options); - std::unordered_map inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}}; + std::unordered_map inputs = {{tv, tensor}}; for (auto repetition : c10::irange(kNumRepetitions)) { - send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank); - std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl; + tensor.copy_(at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+(1-my_rank))); + torch::cuda::synchronize(); + communicator_->barrier(); + std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", tensor=" << tensor << std::endl; executor.runWithInput(inputs); torch::cuda::synchronize(); + communicator_->barrier(); std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl; - auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer; - EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " 
with recv tensor " << recv_tensor << " and ref " << ref; + if (my_rank == 1) { + auto ref = at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+my_rank); + EXPECT_TRUE(torch::allclose(tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with tensor " << tensor << " and ref " << ref; + } } } From c7c0404d50ae60f1c8b9596f1cc3ca73bdfaae74 Mon Sep 17 00:00:00 2001 From: snordmann Date: Fri, 7 Feb 2025 06:00:49 -0800 Subject: [PATCH 40/55] change signature of P2Pcomms to accept src and dst --- csrc/host_ir/executor.cpp | 30 +++++++++---- csrc/multidevice/communication.cpp | 43 ++++++------------- csrc/multidevice/communication.h | 21 ++++----- tests/cpp/test_multidevice_communications.cpp | 5 ++- tests/cpp/test_multidevice_host_ir.cpp | 12 +++--- tests/cpp/test_multidevice_overlap.cpp | 8 ++-- 6 files changed, 57 insertions(+), 62 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 5a45c9b23e0..3e454270ec5 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -477,16 +477,26 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); + const int64_t my_rank = communicator_->deviceId(); + const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); + const auto src = expr_evaluator_.evaluate(communication->src()).as(); + const bool is_sender = my_rank == src; + const bool is_receiver = my_rank == dst; + if (!(is_sender || is_receiver)) { + return; + } + + CommunicatorBackend backend_type = communication->backend(); at::Tensor buffer = getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_); - CommunicatorBackend backend_type = communication->backend(); if (backend_type != CommunicatorBackend::kCuda) { works_[communication] = postSingleCommunication( communication, communicator_->deviceId(), - 
expr_evaluator_.evaluate(communication->peer()).as(), + expr_evaluator_.evaluate(communication->dst()).as(), + expr_evaluator_.evaluate(communication->src()).as(), communicator_->getWorld(), buffer); return; @@ -540,8 +550,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); const std::vector& remote_buffers = communicator_->getRemoteBuffer(buffer, ""); - const int64_t my_rank = communicator_->deviceId(); - const int64_t peer = expr_evaluator_.evaluate(communication->peer()).as(); + const int64_t peer = is_sender ? dst : src; const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); const RemoteBufferInfo& peer_buffer = remote_buffers.at(peer); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); @@ -549,7 +558,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); - if (communication->type() == P2PCommunicationType::RECV) { + if (is_receiver) { std::cout << "RANK " << my_rank << " RECV, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; // signal to self that transfer is in progress NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); @@ -591,13 +600,16 @@ void HostIrEvaluator::handle(Wait* wait) { return; } - if (p2p_comm->type() == P2PCommunicationType::RECV) { + + const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as(); + const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as(); + const int64_t my_rank = communicator_->deviceId(); + const bool is_receiver = my_rank == dst; + if (is_receiver) { // const auto current_stream = static_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); const std::vector& remote_buffers = 
communicator_->getRemoteBuffer(getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_), std::to_string(p2p_comm->buffer()->name())); - const int64_t my_rank = communicator_->deviceId(); - const int64_t peer = expr_evaluator_.evaluate(p2p_comm->peer()).as(); const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); - const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); + const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[src]); std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ)); diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 07861329567..e48290241b0 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -215,30 +215,16 @@ std::string Communication::toInlineString(int indent_size) const { return toString(indent_size); } -std::ostream& operator<<(std::ostream& os, const P2PCommunicationType& type) { - switch (type) { - case P2PCommunicationType::SEND: - os << "send"; - break; - case P2PCommunicationType::RECV: - os << "recv"; - break; - default: - NVF_THROW("unrecognized P2PCommunicationType: ", type); - } - return os; -} - P2PCommunication::P2PCommunication( IrBuilderPasskey passkey, - P2PCommunicationType type, TensorView* buffer, - Val* peer, + Val* dst, + Val* src, CommunicatorBackend backend) : Expr(passkey) { addInput(buffer); - addDataAttribute(type); - addAttribute(peer); + addAttribute(dst); + addAttribute(src); addDataAttribute(backend); } @@ -247,9 +233,9 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(P2PCommunication) std::string P2PCommunication::toString(const int indent_size) const { std::stringstream ss; indent(ss, indent_size) << "P2PCommunication " << name() << " (" - << "type=" << type() << ", " << "buffer=" << 
buffer() << ", " - << "peer=" << peer() << ")\n"; + << "dst=" << dst() << ", " + << "src=" << src() << ")\n"; return ss.str(); } @@ -588,19 +574,18 @@ c10::intrusive_ptr postRecv( c10::intrusive_ptr postSingleCommunication( P2PCommunication* communication, DeviceIdxType my_device_index, - DeviceIdxType peer, + DeviceIdxType dst, + DeviceIdxType src, c10d::Backend* backend, at::Tensor buffer) { NVF_ERROR(backend != nullptr); - switch (communication->type()) { - case P2PCommunicationType::SEND: - return postSend(communication, my_device_index, peer, backend, buffer); - case P2PCommunicationType::RECV: - return postRecv(communication, my_device_index, peer, backend, buffer); - default: - NVF_THROW("Wrong communication type: ", communication->type()); - return nullptr; + if (my_device_index == src) { + return postSend(communication, my_device_index, dst, backend, buffer); + } else if (my_device_index == dst) { + return postRecv(communication, my_device_index, src, backend, buffer); + } else { + return nullptr; } } diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h index d8724356e15..6c2049fba3e 100644 --- a/csrc/multidevice/communication.h +++ b/csrc/multidevice/communication.h @@ -121,19 +121,15 @@ class Communication : public Expr { void validate(); }; -enum class P2PCommunicationType { SEND, RECV }; - -std::ostream& operator<<(std::ostream& os, const P2PCommunicationType& type); - class P2PCommunication : public Expr { public: using Expr::Expr; P2PCommunication( IrBuilderPasskey passkey, - P2PCommunicationType type, TensorView* buffer, - Val* peer, + Val* dst, + Val* src, CommunicatorBackend backend = CommunicatorBackend::kNccl); P2PCommunication(const P2PCommunication& other) = delete; @@ -149,15 +145,15 @@ class P2PCommunication : public Expr { return "P2PCommunication"; } - P2PCommunicationType type() const { - return attribute(0); - } - TensorView* buffer() const { return input(0)->as(); } - Val* peer() const { + Val* dst() const { 
+ return attributeVal(0); + } + + Val* src() const { return attributeVal(1); } @@ -235,7 +231,8 @@ c10::intrusive_ptr postSingleCommunication( c10::intrusive_ptr postSingleCommunication( P2PCommunication* communication, DeviceIdxType my_device_index, - DeviceIdxType peer, + DeviceIdxType dst, + DeviceIdxType src, c10d::Backend* backend, at::Tensor buffer); diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index ad13c747a5b..e5e6e3e78e1 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -435,12 +435,13 @@ TEST_F(P2PCommunicationTest, CudaComm) { auto container = std::make_unique(); FusionGuard fg(container.get()); auto* tv = makeContigTensor(1); + auto* val_my_rank = IrBuilder::create(my_rank, DataType::Int); container->addInput(tv); if (my_rank == 0) { const DeviceIdxType send_peer = (my_rank + 1) % size; auto* val_send_peer = IrBuilder::create(send_peer, DataType::Int); - auto send = IrBuilder::create(P2PCommunicationType::SEND, tv, val_send_peer, CommunicatorBackend::kCuda); + auto send = IrBuilder::create(tv, val_send_peer, val_my_rank, CommunicatorBackend::kCuda); auto wait_send = IrBuilder::create(send); container->pushBackTopLevelExprs(send); container->pushBackTopLevelExprs(wait_send); @@ -448,7 +449,7 @@ TEST_F(P2PCommunicationTest, CudaComm) { ASSERT_EQ(my_rank, 1); const DeviceIdxType recv_peer = (size + my_rank - 1) % size; auto* val_recv_peer = IrBuilder::create(recv_peer, DataType::Int); - auto recv = IrBuilder::create(P2PCommunicationType::RECV, tv, val_recv_peer, CommunicatorBackend::kCuda); + auto recv = IrBuilder::create(tv, val_my_rank, val_recv_peer, CommunicatorBackend::kCuda); auto wait_recv = IrBuilder::create(recv); container->pushBackTopLevelExprs(recv); container->pushBackTopLevelExprs(wait_recv); diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index b2e2d12cb6d..ab65c27405f 
100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -262,13 +262,13 @@ TEST_F(P2PCommHostIrTest, RingPairwiseExchange) { TensorView* recv_buffer = makeContigTensor(1); auto* send = IrBuilder::create( - P2PCommunicationType::SEND, send_buffer, - IrBuilder::create(send_peer)); + IrBuilder::create(send_peer), + IrBuilder::create(my_device_index)); auto* recv = IrBuilder::create( - P2PCommunicationType::RECV, recv_buffer, + IrBuilder::create(my_device_index), IrBuilder::create(recv_peer)); auto* wait = IrBuilder::create(recv); @@ -316,12 +316,12 @@ TEST_F(P2PCommHostIrTest, CoalescedRingPairwiseExchange) { auto* start_coalescing = IrBuilder::create(); auto* send = IrBuilder::create( - P2PCommunicationType::SEND, send_buffer, - IrBuilder::create(send_peer)); + IrBuilder::create(send_peer), + IrBuilder::create(my_device_index)); auto* recv = IrBuilder::create( - P2PCommunicationType::RECV, recv_buffer, + IrBuilder::create(my_device_index), IrBuilder::create(recv_peer)); auto* end_coalescing = IrBuilder::create(); auto* wait = IrBuilder::create(end_coalescing); diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index 47bb5d915db..c1df5684c47 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -1076,9 +1076,9 @@ TEST_F( auto* start_coalescing = IrBuilder::create(); auto* send = IrBuilder::create( - P2PCommunicationType::SEND, src_buffer_ij, send_rank); + src_buffer_ij, send_rank, my_device_index_val); auto* recv = IrBuilder::create( - P2PCommunicationType::RECV, dst_buffer_ij, recv_rank); + dst_buffer_ij, my_device_index_val, recv_rank); auto* end_coalescing = IrBuilder::create(); auto* wait = IrBuilder::create(end_coalescing); @@ -1668,9 +1668,9 @@ TEST_F( auto* start_coalescing = IrBuilder::create(); auto* send = IrBuilder::create( - P2PCommunicationType::SEND, tva_j_curr_slice, send_rank); + tva_j_curr_slice, send_rank, 
my_device_index_val); auto* recv = IrBuilder::create( - P2PCommunicationType::RECV, tva_j_next_slice, recv_rank); + tva_j_next_slice, my_device_index_val, recv_rank); auto* end_coalescing = IrBuilder::create(); auto* wait = IrBuilder::create(end_coalescing); From 6c20a20bc62fc255e8585b6ea442079224424eb9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Tue, 11 Feb 2025 15:38:14 -0800 Subject: [PATCH 41/55] working chkpt with get zcopy --- csrc/dispatch.h | 3 +- csrc/host_ir/executor.cpp | 224 ++++++++++++------ csrc/host_ir/executor.h | 46 ++++ csrc/host_ir/host_ir.cpp | 22 ++ csrc/host_ir/host_ir.h | 23 ++ csrc/multidevice/communicator.cpp | 61 ----- csrc/multidevice/communicator.h | 65 +---- tests/cpp/test_multidevice_communications.cpp | 67 +++--- 8 files changed, 288 insertions(+), 223 deletions(-) diff --git a/csrc/dispatch.h b/csrc/dispatch.h index ee47464a6fb..1eb584bc2d7 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -155,7 +155,8 @@ class Val; f(Wait); \ f(Synchronize); \ f(StartCoalescing); \ - f(EndCoalescing); + f(EndCoalescing); \ + f(ShareMemHandles); // Forward declarations for all Val and Expr types diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 3e454270ec5..6b144ddd7d8 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -410,6 +410,125 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } } +RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor) : ptr_(tensor.data_ptr()), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) { + + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); + const auto number_of_semaphores = Communicator::getInstance().size(); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore))); + static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int"); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, 
(int)IpcSemaphore::kReady, number_of_semaphores * sizeof(IpcSemaphore))); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); +} + +RemoteBufferInfo::RemoteBufferInfo(std::vector data) : is_imported_(true) { + const RemoteBufferInfo& imported_buffer = fromBytes(data); + + storage_offset_ = imported_buffer.storage_offset_; + element_size_ = imported_buffer.element_size_; + ipc_handle_ = imported_buffer.ipc_handle_; + semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; + + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); + ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); + + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); +} + +RemoteBufferInfo::~RemoteBufferInfo() { + if (is_imported_) { + std::cout << "RANK " << Communicator::getInstance().deviceId() << " closes ipc handle" << std::endl; + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_)); + } else { + std::cout << "RANK " << Communicator::getInstance().deviceId() << " frees semaphores" << std::endl; + NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_)); + } +} + +std::ostream& operator<<(std::ostream& os, const cudaIpcMemHandle_t& info) { + uint8_t* ptr = (uint8_t*)&info; + for (int i = 0; i < (int)sizeof(cudaIpcMemHandle_t); i++) { + os << ptr[i]; + } + return os; +} + + + +std::ostream& operator<<(std::ostream& os, const RemoteBufferInfo& info) { + os << "RemoteBufferInfo(ptr=" << info.ptr_ + << ", storage_offset=" << info.storage_offset_ + << ", element_size=" << info.element_size_ + << ", is_imported=" << info.is_imported_ + << ", semaphores_=" << info.semaphores_ + << ", ipc_handle_=" << info.ipc_handle_ + << ", semaphores_ipc_handle_=" << info.semaphores_ipc_handle_ + << ")"; + return os; +} + +void 
HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { + const int64_t my_rank = communicator_->deviceId(); + auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { + return this->expr_evaluator_.evaluate(communication->buffer()).as(); + }; + + std::vector communications; + for (auto expr: share_mem_handles->communications()) { + auto communication = expr->as(); + const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); + const auto src = expr_evaluator_.evaluate(communication->src()).as(); + const bool is_sender = my_rank == src; + const bool is_receiver = my_rank == dst; + if (!(is_sender ^ is_receiver)) { // REMOVE or adapt exporting/opening the handle + return; + } + if (remote_buffers_.find(get_tensor(communication)) != remote_buffers_.end()) { + continue; + } + communications.push_back(communication); + } + + // put memhandles to TCP store + auto get_key = [this] (P2PCommunication* communication, int64_t rank) -> std::string { + return "nvfuser_remote_buffer_info_P2PComm_dst=" + std::to_string(this->expr_evaluator_.evaluate(communication->dst()).as()) + "_src=" + std::to_string(this->expr_evaluator_.evaluate(communication->src()).as()) + "_rank=" + std::to_string(rank); + }; + std::unordered_map> buffer_handles; + auto store = communicator_->getTcpStore(); + for (P2PCommunication* communication: communications) { + auto buffer_handle = std::make_unique(get_tensor(communication)); + std::cout << "RANK " << my_rank << " registers at key " << get_key(communication, my_rank) << std::endl; + store->set(get_key(communication, my_rank), toBytes(*buffer_handle)); + std::cout << "RANK " << my_rank << " creates buffer_handle " << *buffer_handle << std::endl; + buffer_handles.emplace(communication, std::move(buffer_handle)); + } + + // barrier to ensure all ranks have pushed their memhandles to the store + // TODO: precisely select what ranks need to wait on that barrier. 
+ communicator_->barrier(); + + // get memhandles to TCP store + for (P2PCommunication* communication: communications) { + std::vector> remote_buffers; + remote_buffers.reserve(communicator_->size()); + for (int64_t rank : c10::irange(communicator_->size())) { + std::cout << "RANK " << my_rank << " after barrier for key " << get_key(communication, rank) << std::endl; + if (rank == my_rank) { + // opening an ipc handle on the exporter's device is not supported + remote_buffers.push_back(std::move(buffer_handles.at(communication))); + } else { + std::string key = get_key(communication, rank); + NVF_ERROR(store->check({key}), "key ", key, " not found in store at rank ", my_rank); + auto imported_remote_buffer_info = std::make_unique(store->get(key)); + remote_buffers.push_back(std::move(imported_remote_buffer_info)); + } + std::cout << "RANK " << my_rank << " emplaces at rank " << rank << " remote buffer " << *remote_buffers.back() << std::endl; + } + remote_buffers_.emplace(get_tensor(communication), std::move(remote_buffers)); + } +} + + void HostIrEvaluator::handle(Communication* communication) { NVF_ERROR( communicator_ != nullptr && communicator_->is_available(), @@ -482,7 +601,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { const auto src = expr_evaluator_.evaluate(communication->src()).as(); const bool is_sender = my_rank == src; const bool is_receiver = my_rank == dst; - if (!(is_sender || is_receiver)) { + if (!(is_sender ^ is_receiver)) { return; } @@ -502,80 +621,49 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { return; } - - - // FIST TIME: - // sender exports cudaIpc mem handle on input buffer and put it to store - // sender signals recv it can open the mem handle. It needs to be CPU blocking - // recv opens the handle and gets the pointer. 
- // It copies the data and then signal sender on completion - - // SECOND TIME: - // Sender signals recv it can copy - // Recv copies the data and signals sender on completion - -// each rank must have a bool "recvied" and a "sent" bool per rank. So n+1 -// each rank must have, per rank, a sent_to and a received_from a bool "recvied" and a "sent" bool per rank. So n+1 - - - - // std::string prefix = "nvfuser_ipc_tensor_info_" + communication->buffer()->name() + "_"; - // IpcTensorInfo ipc_tensor_info; - // NVFUSER_CUDA_RT_SAFE_CALL( - // cudaIpcGetMemHandle(&ipc_tensor_info.ipc_handle, buffer.data_ptr())); - // ipc_tensor_info.storage_offset = buffer.storage_offset(); - // ipc_tensor_info.element_size = buffer.element_size(); - - // auto store = communicator_->getTcpStore(); - // store->set(prefix + std::to_string(my_rank), toBytes(ipc_tensor_info)); - - // Team team = {my_rank, peer}; - // communicator_->getBackendForTeam(team, CommunicatorBackend::kNccl)->barrier()->wait(); - - // for (int64_t rank : c10::irange(size())) { - // if (rank == my_rank) { - // remote_ptrs.at(rank) = tensor.data_ptr(); - // } else { - // ipc_tensor_info = - // fromBytes(store->get(prefix + std::to_string(rank))); - // void*& ptr = remote_ptrs.at(rank); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( - // &ptr, ipc_tensor_info.ipc_handle, cudaIpcMemLazyEnablePeerAccess)); - // // TODO: close ipc mem handle at shutdown - // ptr = (void*)((uint8_t*)ptr + - // ipc_tensor_info.storage_offset * - // ipc_tensor_info.element_size); - // } - // } - - const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); - const std::vector& remote_buffers = communicator_->getRemoteBuffer(buffer, ""); + const auto it = remote_buffers_.find(buffer); + NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", communication->toString(), " at rank ", my_rank); + const std::vector>& remote_buffers = it->second; const int64_t peer = is_sender 
? dst : src; - const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); - const RemoteBufferInfo& peer_buffer = remote_buffers.at(peer); + const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); + const RemoteBufferInfo& peer_buffer = *remote_buffers.at(peer); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); const auto remote_semaphore = reinterpret_cast(&peer_buffer.semaphores()[my_rank]); static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); + const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); if (is_receiver) { - std::cout << "RANK " << my_rank << " RECV, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; + std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; // signal to self that transfer is in progress NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // signal sender that receiver is ready NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error + std::cout << "RANK " << my_rank << " RECV BEFORE MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( + buffer.data_ptr(), + peer_buffer.ptr(), + // my_buffer.ptr(), + buffer.numel() * buffer.element_size(), + 
cudaMemcpyDeviceToDevice + // current_stream)); + )); + std::cout << "RANK " << my_rank << " RECV AFTER MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; } else /*sender*/ { - std::cout << "RANK " << my_rank << " SEND, local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; + std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; // wait for sender to be ready - // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); - std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << std::endl; + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); + std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl; // RDMA writes data from sender to receiver - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - peer_buffer.ptr(), - my_buffer.ptr(), - buffer.numel() * buffer.element_size(), - cudaMemcpyDeviceToDevice, - current_stream)); + // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( + // peer_buffer.ptr(), + // buffer.data_ptr(), + // // my_buffer.ptr(), + // buffer.numel() * buffer.element_size(), + // cudaMemcpyDeviceToDevice + // // current_stream)); + // )); std::cout << "RANK " << my_rank << " SEND after memcpy" << std::endl; // Signals completion to self 
NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); @@ -606,14 +694,18 @@ void HostIrEvaluator::handle(Wait* wait) { const int64_t my_rank = communicator_->deviceId(); const bool is_receiver = my_rank == dst; if (is_receiver) { - // const auto current_stream = static_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); - const std::vector& remote_buffers = communicator_->getRemoteBuffer(getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_), std::to_string(p2p_comm->buffer()->name())); - const RemoteBufferInfo& my_buffer = remote_buffers.at(my_rank); + const auto current_stream = static_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); + const auto it = remote_buffers_.find(buffer); + NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank); + const std::vector>& remote_buffers = it->second; + + const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[src]); std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; - // NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ)); - std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ)); + std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() 
<< "recv tensor=" << buffer << std::endl; } } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 8e281b66143..f052c1bfeb7 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -50,6 +50,35 @@ class HostIrExecutor : public ExecutorAbstract { namespace hir { +enum class IpcSemaphore : cuuint32_t { + kReady, + kTransferInProgress +}; + +class RemoteBufferInfo { + public: + + RemoteBufferInfo(at::Tensor tensor); + RemoteBufferInfo(std::vector data); // means it is imported + ~RemoteBufferInfo(); + + void* ptr() const { + return ptr_; + } + + auto semaphores() const { + return semaphores_; + } + + void* ptr_; + int64_t storage_offset_; + int64_t element_size_; + bool is_imported_; + cudaIpcMemHandle_t ipc_handle_; + cudaIpcMemHandle_t semaphores_ipc_handle_; + IpcSemaphore* semaphores_; +}; + /* a HostIrEvaluator evaluates a host programs represented through a HostIrContainer It is instantiated with the desired HostIrContainer, and runs @@ -129,6 +158,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(MatmulOp* matmul) override; void handle(LinearOp* linear) override; void handle(kir::Allocate* allocate) override; + void handle(ShareMemHandles* share_mem_handles) override; void unhandled(Statement* stmt) override; c10::cuda::CUDAStream getCUDAStream(Stream* stream); @@ -145,6 +175,22 @@ class HostIrEvaluator final : public OptOutDispatch { std::unordered_map streams_; std::unordered_map> works_; const int64_t my_local_device_index_; + struct TensorHash { + std::size_t operator()(const at::Tensor& tensor) const { + auto ptr = reinterpret_cast(tensor.data_ptr()); + auto offset = tensor.storage_offset(); + auto element_size = tensor.element_size(); + return std::hash()(ptr) ^ std::hash()(offset) ^ + std::hash()(element_size); + } + }; + struct TensorEqual { + bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const { + return lhs.equal(rhs); + } + }; + std::unordered_map>, TensorHash, TensorEqual> + 
remote_buffers_; }; } // namespace hir diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index c99ddb2f345..5ea51fd82ff 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -323,6 +323,28 @@ std::string EndCoalescing::toInlineString(int indent_size) const { NVF_CHECK(false, "Cannot be printed inline"); } + +ShareMemHandles::ShareMemHandles(IrBuilderPasskey passkey, std::vector communications) : Expr(passkey) { + NVF_ERROR(passkey.ir_container_ != nullptr); + NVF_ERROR( + passkey.ir_container_->isA(), + this, + "must be registered in a HostIrContainer"); + addDataAttribute(std::move(communications)); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(ShareMemHandles) + +std::string ShareMemHandles::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << "ShareMemHandles" << std::endl; + return ss.str(); +} + +std::string ShareMemHandles::toInlineString(int indent_size) const { + NVF_CHECK(false, "Cannot be printed inline"); +} + } // namespace hir } // namespace nvfuser diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index 3ca06779684..64cdb404a8c 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -315,6 +315,29 @@ class EndCoalescing : public Expr { } }; +class ShareMemHandles : public Expr { + public: + using Expr::Expr; + ShareMemHandles(IrBuilderPasskey passkey, std::vector communications); + + ShareMemHandles(const ShareMemHandles& other) = delete; + ShareMemHandles& operator=(const ShareMemHandles& other) = delete; + ShareMemHandles(ShareMemHandles&& other) = delete; + ShareMemHandles& operator=(ShareMemHandles&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::ShareMemHandles"; + } + + const std::vector& communications() { + return attribute>(0); + } +}; + } // namespace hir } // namespace 
nvfuser diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 902ceaaa64d..ce102695637 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -319,65 +319,4 @@ void Communicator::barrier(std::optional backend) { getWorld(backend)->barrier(options)->wait(); } -RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor, int64_t size) : ptr_(tensor.data_ptr()), size_(size), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) { - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, size_ * sizeof(IpcSemaphore))); - static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int"); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, size_ * sizeof(IpcSemaphore))); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); -} - -RemoteBufferInfo::RemoteBufferInfo(std::vector data) : is_imported_(true) { - RemoteBufferInfo imported_buffer = fromBytes(data); - - size_ = imported_buffer.size_; - storage_offset_ = imported_buffer.storage_offset_; - element_size_ = imported_buffer.element_size_; - ipc_handle_ = imported_buffer.ipc_handle_; - semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; - - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); - ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); - - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); -} - -RemoteBufferInfo::~RemoteBufferInfo() { - // if (is_imported_) { - // NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&ipc_handle_)); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(&semaphores_ipc_handle_)); - // } else { - // NVFUSER_CUDA_RT_SAFE_CALL(cudaFree(semaphores_)); - 
// } -} - - -std::vector Communicator::getRemoteBuffer(at::Tensor tensor, std::string key) { - auto it = remote_buffers_.find(tensor); - if (it == remote_buffers_.end()) { - RemoteBufferInfo buffer_handle(tensor, size_); - - auto store = getTcpStore(); - std::string prefix = "nvfuser_remote_buffer_info_" + key; - std::cout << "RANK " << deviceId() << "registers at key " << prefix + std::to_string(deviceId()) << std::endl; - store->set(prefix + std::to_string(deviceId()), toBytes(buffer_handle)); - - barrier(); - - std::cout << "RANK " << deviceId() << "after barrier for key " << prefix + std::to_string(deviceId()) << std::endl; - std::vector remote_buffers; - remote_buffers.reserve(size_); - for (int64_t rank : c10::irange(size_)) { - if (rank == deviceId()) { - remote_buffers.push_back(std::move(buffer_handle)); - } else { - RemoteBufferInfo imported_remote_buffer_info(store->get(prefix + std::to_string(rank))); - remote_buffers.push_back(std::move(imported_remote_buffer_info)); - } - } - it = remote_buffers_.emplace(tensor, std::move(remote_buffers)).first; - } - return it->second; -} - } // namespace nvfuser diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 39c9d667bf4..53c9fbcead8 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -30,52 +30,17 @@ namespace nvfuser { template -std::vector toBytes(T data) { +std::vector toBytes(const T& data) { return std::vector( - reinterpret_cast(&data), - reinterpret_cast(&data) + sizeof(T)); + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); } template -T fromBytes(std::vector bytes) { - return *reinterpret_cast(bytes.data()); +const T& fromBytes(const std::vector& bytes) { + return *reinterpret_cast(bytes.data()); } -enum class IpcSemaphore : cuuint32_t { - kReady, - kTransferInProgress -}; - -class RemoteBufferInfo { - public: - - RemoteBufferInfo(at::Tensor tensor, int64_t size); - RemoteBufferInfo(std::vector data); // means it is 
imported - ~RemoteBufferInfo(); - - void* ptr() const { - return ptr_; - } - - auto semaphores() const { - return semaphores_; - } - - auto size() const { - return size_; - } - - private: - void* ptr_; - int64_t size_; - int64_t storage_offset_; - int64_t element_size_; - bool is_imported_; - cudaIpcMemHandle_t ipc_handle_; - cudaIpcMemHandle_t semaphores_ipc_handle_; - IpcSemaphore* semaphores_; -}; - // This file implements the class Communicator which sets up the inter-process // Backend. This class contains inter-process information, such as the rank, the // world size, as well as the Process Group that can be called to perform @@ -194,25 +159,7 @@ class Communicator { return store_; } - std::vector getRemoteBuffer(at::Tensor tensor, std::string key); - private: - struct TensorHash { - std::size_t operator()(const at::Tensor& tensor) const { - auto ptr = reinterpret_cast(tensor.data_ptr()); - auto offset = tensor.storage_offset(); - auto element_size = tensor.element_size(); - return std::hash()(ptr) ^ std::hash()(offset) ^ - std::hash()(element_size); - } - }; - - struct TensorEqual { - bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const { - return lhs.equal(rhs); - } - }; - Communicator( CommunicatorBackend backend = comm_backend_default, RankType server_local_rank = comm_server_local_rank_default); @@ -245,8 +192,6 @@ class Communicator { c10::intrusive_ptr store_; // cache for the created backends. 
The keys are strings generated from Teams std::unordered_map> backends_; - std::unordered_map, TensorHash, TensorEqual> - remote_buffers_; }; } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index e5e6e3e78e1..ebd6ef0600b 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -417,65 +417,62 @@ using P2PCommunicationTest = MultiDeviceTest; TEST_F(P2PCommunicationTest, CudaComm) { static constexpr int kTensorSize = 8; - static constexpr int kNumRepetitions = 8; + static constexpr int kNumRepetitions = 2; if (communicator_->size() < 2 || torch::cuda::device_count() < 2) { GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks."; } - if (communicator_->size() != 2) { - GTEST_SKIP() << "This test needs for now exactly 2 GPUs and 2 ranks."; - } - - - const DeviceIdxType my_rank = communicator_->deviceId(); const DeviceIdxType size = communicator_->size(); + const DeviceIdxType send_peer = (my_rank + 1) % size; + const DeviceIdxType recv_peer = (size + my_rank - 1) % size; auto container = std::make_unique(); FusionGuard fg(container.get()); - auto* tv = makeContigTensor(1); - auto* val_my_rank = IrBuilder::create(my_rank, DataType::Int); - container->addInput(tv); - if (my_rank == 0) { - const DeviceIdxType send_peer = (my_rank + 1) % size; - - auto* val_send_peer = IrBuilder::create(send_peer, DataType::Int); - auto send = IrBuilder::create(tv, val_send_peer, val_my_rank, CommunicatorBackend::kCuda); - auto wait_send = IrBuilder::create(send); - container->pushBackTopLevelExprs(send); - container->pushBackTopLevelExprs(wait_send); - } else { - ASSERT_EQ(my_rank, 1); - const DeviceIdxType recv_peer = (size + my_rank - 1) % size; - auto* val_recv_peer = IrBuilder::create(recv_peer, DataType::Int); - auto recv = IrBuilder::create(tv, val_my_rank, val_recv_peer, CommunicatorBackend::kCuda); - auto wait_recv = 
IrBuilder::create(recv); - container->pushBackTopLevelExprs(recv); - container->pushBackTopLevelExprs(wait_recv); - } + + auto* my_rank_val = IrBuilder::create(my_rank, DataType::Int); + auto* recv_peer_val = IrBuilder::create(recv_peer, DataType::Int); + auto* send_peer_val = IrBuilder::create(send_peer, DataType::Int); + + auto* send_tv = makeContigTensor(1); + auto* recv_tv = makeContigTensor(1); + container->addInput(send_tv); + container->addInput(recv_tv); + + auto recv = IrBuilder::create(recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda); + auto send = IrBuilder::create(send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda); + std::vector grouped_communications = {recv, send}; + auto share_mem_handles = IrBuilder::create(std::move(grouped_communications)); + auto wait_recv = IrBuilder::create(recv); + auto wait_send = IrBuilder::create(send); + + container->pushBackTopLevelExprs(share_mem_handles); + container->pushBackTopLevelExprs(recv); + container->pushBackTopLevelExprs(send); + container->pushBackTopLevelExprs(wait_recv); + container->pushBackTopLevelExprs(wait_send); hir::HostIrEvaluator executor(std::move(container), communicator_); - at::Tensor tensor = at::empty({kTensorSize}, tensor_options); + at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options); + at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options); - std::unordered_map inputs = {{tv, tensor}}; + std::unordered_map inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}}; for (auto repetition : c10::irange(kNumRepetitions)) { - tensor.copy_(at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+(1-my_rank))); + send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank); + std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl; torch::cuda::synchronize(); communicator_->barrier(); - 
std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", tensor=" << tensor << std::endl; executor.runWithInput(inputs); torch::cuda::synchronize(); communicator_->barrier(); std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl; - if (my_rank == 1) { - auto ref = at::arange(kTensorSize, tensor_options) + (1+repetition) * 10 + 100* (1+my_rank); - EXPECT_TRUE(torch::allclose(tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with tensor " << tensor << " and ref " << ref; - } + auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer; + EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref; } } From f7409b20b0a649d21bf4e7445e075f4f12d42498 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 04:46:15 -0800 Subject: [PATCH 42/55] working checkpt with many ranks --- csrc/host_ir/executor.cpp | 6 ++++++ tests/cpp/test_multidevice_communications.cpp | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 6b144ddd7d8..b1f45ee610c 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -511,7 +511,13 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { for (P2PCommunication* communication: communications) { std::vector> remote_buffers; remote_buffers.reserve(communicator_->size()); + const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); + const auto src = expr_evaluator_.evaluate(communication->src()).as(); for (int64_t rank : c10::irange(communicator_->size())) { + if (rank != src && rank != dst) { + remote_buffers.push_back(nullptr); + continue; + } std::cout << "RANK " << my_rank << " after barrier for key " << get_key(communication, rank) << std::endl; if (rank == my_rank) { // opening an ipc handle on the 
exporter's device is not supported diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index ebd6ef0600b..df43712d4d3 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -461,7 +461,7 @@ TEST_F(P2PCommunicationTest, CudaComm) { std::unordered_map inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}}; for (auto repetition : c10::irange(kNumRepetitions)) { - send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * my_rank); + send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * my_rank); std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl; torch::cuda::synchronize(); communicator_->barrier(); @@ -471,7 +471,7 @@ TEST_F(P2PCommunicationTest, CudaComm) { torch::cuda::synchronize(); communicator_->barrier(); std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl; - auto ref = at::arange(kTensorSize, tensor_options) + repetition * recv_peer; + auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * recv_peer; EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref; } } From 08f8fe03c041b3960943825551439279578dd5d2 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 05:09:10 -0800 Subject: [PATCH 43/55] chkpt non blocking --- csrc/host_ir/executor.cpp | 39 +++++++++---------- tests/cpp/test_multidevice_communications.cpp | 2 +- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index b1f45ee610c..d6cfa474e52 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -640,12 +640,11 @@ void 
HostIrEvaluator::handle(P2PCommunication* communication) { const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); if (is_receiver) { - std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; - // signal to self that transfer is in progress - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); - // signal sender that receiver is ready - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error - std::cout << "RANK " << my_rank << " RECV BEFORE MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; + // wait for sender to be ready + std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); + std::cout << "RANK " << my_rank << " RECV after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl; + // RDMA get the data from the sender NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( buffer.data_ptr(), peer_buffer.ptr(), 
@@ -654,13 +653,19 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { cudaMemcpyDeviceToDevice // current_stream)); )); - std::cout << "RANK " << my_rank << " RECV AFTER MEMCPY, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; + std::cout << "RANK " << my_rank << " RECV after memcpy" << std::endl; + // Signals completion to self + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); + // Signals completion to receiver + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); } else /*sender*/ { - std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; - // wait for sender to be ready - NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); - std::cout << "RANK " << my_rank << " SEND after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl; - // RDMA writes data from sender to receiver + std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; + std::cout << "RANK " << my_rank << " SEND BEFORE signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << 
remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; + // signal to self that transfer is in progress + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); + // signal to receiver that the buffer is ready + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error + std::cout << "RANK " << my_rank << " SEND AFTER signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( // peer_buffer.ptr(), @@ -670,11 +675,6 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { // cudaMemcpyDeviceToDevice // // current_stream)); // )); - std::cout << "RANK " << my_rank << " SEND after memcpy" << std::endl; - // Signals completion to self - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); - // Signals completion to receiver - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); } } @@ -698,8 +698,7 @@ void HostIrEvaluator::handle(Wait* wait) { const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as(); const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as(); const int64_t my_rank = communicator_->deviceId(); - const bool is_receiver = my_rank == dst; - if (is_receiver) { + if (my_rank == src) { const auto current_stream = 
static_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); const auto it = remote_buffers_.find(buffer); @@ -707,7 +706,7 @@ void HostIrEvaluator::handle(Wait* wait) { const std::vector>& remote_buffers = it->second; const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); - const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[src]); + const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[dst]); std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ)); diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index df43712d4d3..c0a41348a1f 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -448,8 +448,8 @@ TEST_F(P2PCommunicationTest, CudaComm) { auto wait_send = IrBuilder::create(send); container->pushBackTopLevelExprs(share_mem_handles); - container->pushBackTopLevelExprs(recv); container->pushBackTopLevelExprs(send); + container->pushBackTopLevelExprs(recv); container->pushBackTopLevelExprs(wait_recv); container->pushBackTopLevelExprs(wait_send); From de843bb8c13d3b9e4f412ccf38bdb02507eba175 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 09:02:18 -0800 Subject: [PATCH 44/55] harden tests by removing hard syncs --- tests/cpp/test_multidevice_communications.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index c0a41348a1f..74efa49efbe 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -417,7 +417,7 @@ using 
P2PCommunicationTest = MultiDeviceTest; TEST_F(P2PCommunicationTest, CudaComm) { static constexpr int kTensorSize = 8; - static constexpr int kNumRepetitions = 2; + static constexpr int kNumRepetitions = 32; if (communicator_->size() < 2 || torch::cuda::device_count() < 2) { GTEST_SKIP() << "This test needs at least 2 GPUs and 2 ranks."; @@ -463,13 +463,9 @@ TEST_F(P2PCommunicationTest, CudaComm) { for (auto repetition : c10::irange(kNumRepetitions)) { send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * my_rank); std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl; - torch::cuda::synchronize(); - communicator_->barrier(); executor.runWithInput(inputs); - torch::cuda::synchronize(); - communicator_->barrier(); std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl; auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * recv_peer; EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref; From 4dc9936e9b22c6d5f9bbaaa356bd10dda6f6126b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 09:09:15 -0800 Subject: [PATCH 45/55] use cudaMemcpyAsync --- csrc/host_ir/executor.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index d6cfa474e52..f74e1a93306 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -645,15 +645,12 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); std::cout << "RANK " << my_rank << " RECV after 1st WAIT" << ", buffer.data_ptr()=" << 
buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl; // RDMA get the data from the sender - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( buffer.data_ptr(), peer_buffer.ptr(), - // my_buffer.ptr(), buffer.numel() * buffer.element_size(), - cudaMemcpyDeviceToDevice - // current_stream)); - )); - std::cout << "RANK " << my_rank << " RECV after memcpy" << std::endl; + cudaMemcpyDeviceToDevice, + current_stream)); // Signals completion to self NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); // Signals completion to receiver From 4e056093a10e170b29e5af3146c633285c3506db Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 09:11:44 -0800 Subject: [PATCH 46/55] clean and lint --- csrc/host_ir/executor.cpp | 255 +++++++++--------- csrc/host_ir/executor.h | 12 +- csrc/host_ir/host_ir.cpp | 6 +- csrc/host_ir/host_ir.h | 4 +- csrc/multidevice/communicator.h | 5 +- tests/cpp/test_multidevice_communications.cpp | 27 +- tests/cpp/test_multidevice_gpu_comms.cpp | 11 +- tests/cpp/test_multidevice_overlap.cpp | 2 +- 8 files changed, 173 insertions(+), 149 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index f74e1a93306..ae4dbb028b8 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include #include -#include namespace nvfuser { @@ -410,17 +410,29 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } } -RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor) : ptr_(tensor.data_ptr()), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), is_imported_(false) { - - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&ipc_handle_, 
tensor.data_ptr())); +RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor) + : ptr_(tensor.data_ptr()), + storage_offset_(tensor.storage_offset()), + element_size_(tensor.element_size()), + is_imported_(false) { + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); const auto number_of_semaphores = Communicator::getInstance().size(); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc((void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore))); - static_assert(sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int"); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset((void*) semaphores_, (int)IpcSemaphore::kReady, number_of_semaphores * sizeof(IpcSemaphore))); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc( + (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore))); + static_assert( + sizeof(IpcSemaphore) == sizeof(int), + "IpcSemaphore must be same size as int"); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset( + (void*)semaphores_, + (int)IpcSemaphore::kReady, + number_of_semaphores * sizeof(IpcSemaphore))); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); } -RemoteBufferInfo::RemoteBufferInfo(std::vector data) : is_imported_(true) { +RemoteBufferInfo::RemoteBufferInfo(std::vector data) + : is_imported_(true) { const RemoteBufferInfo& imported_buffer = fromBytes(data); storage_offset_ = imported_buffer.storage_offset_; @@ -428,78 +440,79 @@ RemoteBufferInfo::RemoteBufferInfo(std::vector data) : is_imported_(tru ipc_handle_ = imported_buffer.ipc_handle_; semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); - 
NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle((void**)&semaphores_, semaphores_ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( + (void**)&semaphores_, + semaphores_ipc_handle_, + cudaIpcMemLazyEnablePeerAccess)); } RemoteBufferInfo::~RemoteBufferInfo() { if (is_imported_) { - std::cout << "RANK " << Communicator::getInstance().deviceId() << " closes ipc handle" << std::endl; NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_)); NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_)); } else { - std::cout << "RANK " << Communicator::getInstance().deviceId() << " frees semaphores" << std::endl; NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_)); } } -std::ostream& operator<<(std::ostream& os, const cudaIpcMemHandle_t& info) { - uint8_t* ptr = (uint8_t*)&info; - for (int i = 0; i < (int)sizeof(cudaIpcMemHandle_t); i++) { - os << ptr[i]; - } - return os; -} - - - std::ostream& operator<<(std::ostream& os, const RemoteBufferInfo& info) { os << "RemoteBufferInfo(ptr=" << info.ptr_ << ", storage_offset=" << info.storage_offset_ << ", element_size=" << info.element_size_ << ", is_imported=" << info.is_imported_ - << ", semaphores_=" << info.semaphores_ - << ", ipc_handle_=" << info.ipc_handle_ - << ", semaphores_ipc_handle_=" << info.semaphores_ipc_handle_ - << ")"; + << ", semaphores_=" << info.semaphores_ << ")"; return os; } void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { const int64_t my_rank = communicator_->deviceId(); - auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { - return this->expr_evaluator_.evaluate(communication->buffer()).as(); + auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { + return this->expr_evaluator_.evaluate(communication->buffer()) + .as(); }; std::vector communications; - for (auto expr: share_mem_handles->communications()) { + for (auto expr : share_mem_handles->communications()) { auto communication = 
expr->as(); - const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); - const auto src = expr_evaluator_.evaluate(communication->src()).as(); + const auto dst = + expr_evaluator_.evaluate(communication->dst()).as(); + const auto src = + expr_evaluator_.evaluate(communication->src()).as(); const bool is_sender = my_rank == src; const bool is_receiver = my_rank == dst; - if (!(is_sender ^ is_receiver)) { // REMOVE or adapt exporting/opening the handle + if (!(is_sender ^ + is_receiver)) { // REMOVE or adapt exporting/opening the handle return; } - if (remote_buffers_.find(get_tensor(communication)) != remote_buffers_.end()) { + if (remote_buffers_.find(get_tensor(communication)) != + remote_buffers_.end()) { continue; } communications.push_back(communication); } // put memhandles to TCP store - auto get_key = [this] (P2PCommunication* communication, int64_t rank) -> std::string { - return "nvfuser_remote_buffer_info_P2PComm_dst=" + std::to_string(this->expr_evaluator_.evaluate(communication->dst()).as()) + "_src=" + std::to_string(this->expr_evaluator_.evaluate(communication->src()).as()) + "_rank=" + std::to_string(rank); + auto get_key = + [this](P2PCommunication* communication, int64_t rank) -> std::string { + return "nvfuser_remote_buffer_info_P2PComm_dst=" + + std::to_string(this->expr_evaluator_.evaluate(communication->dst()) + .as()) + + "_src=" + + std::to_string(this->expr_evaluator_.evaluate(communication->src()) + .as()) + + "_rank=" + std::to_string(rank); }; - std::unordered_map> buffer_handles; + std::unordered_map> + buffer_handles; auto store = communicator_->getTcpStore(); - for (P2PCommunication* communication: communications) { - auto buffer_handle = std::make_unique(get_tensor(communication)); - std::cout << "RANK " << my_rank << " registers at key " << get_key(communication, my_rank) << std::endl; + for (P2PCommunication* communication : communications) { + auto buffer_handle = + std::make_unique(get_tensor(communication)); 
store->set(get_key(communication, my_rank), toBytes(*buffer_handle)); - std::cout << "RANK " << my_rank << " creates buffer_handle " << *buffer_handle << std::endl; buffer_handles.emplace(communication, std::move(buffer_handle)); } @@ -507,34 +520,40 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // TODO: precisely select what ranks need to wait on that barrier. communicator_->barrier(); - // get memhandles to TCP store - for (P2PCommunication* communication: communications) { + // get memhandles to TCP store + for (P2PCommunication* communication : communications) { std::vector> remote_buffers; remote_buffers.reserve(communicator_->size()); - const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); - const auto src = expr_evaluator_.evaluate(communication->src()).as(); + const auto dst = + expr_evaluator_.evaluate(communication->dst()).as(); + const auto src = + expr_evaluator_.evaluate(communication->src()).as(); for (int64_t rank : c10::irange(communicator_->size())) { if (rank != src && rank != dst) { remote_buffers.push_back(nullptr); continue; } - std::cout << "RANK " << my_rank << " after barrier for key " << get_key(communication, rank) << std::endl; if (rank == my_rank) { // opening an ipc handle on the exporter's device is not supported remote_buffers.push_back(std::move(buffer_handles.at(communication))); } else { std::string key = get_key(communication, rank); - NVF_ERROR(store->check({key}), "key ", key, " not found in store at rank ", my_rank); - auto imported_remote_buffer_info = std::make_unique(store->get(key)); + NVF_ERROR( + store->check({key}), + "key ", + key, + " not found in store at rank ", + my_rank); + auto imported_remote_buffer_info = + std::make_unique(store->get(key)); remote_buffers.push_back(std::move(imported_remote_buffer_info)); } - std::cout << "RANK " << my_rank << " emplaces at rank " << rank << " remote buffer " << *remote_buffers.back() << std::endl; } - 
remote_buffers_.emplace(get_tensor(communication), std::move(remote_buffers)); + remote_buffers_.emplace( + get_tensor(communication), std::move(remote_buffers)); } } - void HostIrEvaluator::handle(Communication* communication) { NVF_ERROR( communicator_ != nullptr && communicator_->is_available(), @@ -560,41 +579,6 @@ void HostIrEvaluator::handle(Communication* communication) { } NVF_ERROR(communication->type() == CommunicationType::Allgather); - - // std::vector output_tensors = - // at::tensor_split(output_tensor.squeeze(), communication->team_size(), 0); - // const std::vector& input_ptrs = communicator_->getRemotePtrs(input_tensor); - // cudaStream_t current_stream = - // c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream(); - // // TODO: use multicast - // for (auto i = 0; i < communicator_->size(); i++) { - // cudaStream_t stream = c10::cuda::getStreamFromPool( - // /*isHighPriority=*/false, my_local_device_index_) - // .stream(); - // cudaEvent_t event = {}; - // NVFUSER_CUDA_RT_SAFE_CALL( - // cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, current_stream)); - // NVFUSER_CUDA_RT_SAFE_CALL( - // cudaStreamWaitEvent(stream, event, cudaEventWaitDefault)); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); - - // auto output = output_tensors.at(i); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - // output.data_ptr(), - // input_ptrs.at(i), - // output.numel() * output.element_size(), - // cudaMemcpyDeviceToDevice, - // stream)); - - // // sync - // NVFUSER_CUDA_RT_SAFE_CALL( - // cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(event, stream)); - // NVFUSER_CUDA_RT_SAFE_CALL( - // cudaStreamWaitEvent(current_stream, event, cudaEventWaitDefault)); - // NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); - // } } void HostIrEvaluator::handle(P2PCommunication* communication) { @@ -615,7 +599,6 @@ void 
HostIrEvaluator::handle(P2PCommunication* communication) { at::Tensor buffer = getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_); - if (backend_type != CommunicatorBackend::kCuda) { works_[communication] = postSingleCommunication( communication, @@ -628,22 +611,34 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { } const auto it = remote_buffers_.find(buffer); - NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", communication->toString(), " at rank ", my_rank); - const std::vector>& remote_buffers = it->second; + NVF_ERROR( + it != remote_buffers_.end(), + "No remote buffer found for ", + communication->toString(), + " at rank ", + my_rank); + const std::vector>& remote_buffers = + it->second; const int64_t peer = is_sender ? dst : src; const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); const RemoteBufferInfo& peer_buffer = *remote_buffers.at(peer); - const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); - const auto remote_semaphore = reinterpret_cast(&peer_buffer.semaphores()[my_rank]); - static_assert(sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); + const auto local_semaphore = + reinterpret_cast(&my_buffer.semaphores()[peer]); + const auto remote_semaphore = + reinterpret_cast(&peer_buffer.semaphores()[my_rank]); + static_assert( + sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); - const auto current_stream = reinterpret_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + const auto current_stream = reinterpret_cast( + c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); if (is_receiver) { // wait for sender to be ready - std::cout << "RANK " << my_rank << " RECV, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << std::endl; - NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, 
(cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WAIT_VALUE_EQ)); - std::cout << "RANK " << my_rank << " RECV after 1st WAIT" << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << ", sent tensor=" << buffer << ", buffer.numel()=" << buffer.numel() << ", buffer.element_size()=" << buffer.element_size() << std::endl; + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( + current_stream, + local_semaphore, + (cuuint32_t)(IpcSemaphore::kTransferInProgress), + CU_STREAM_WAIT_VALUE_EQ)); // RDMA get the data from the sender NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( buffer.data_ptr(), @@ -652,26 +647,32 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { cudaMemcpyDeviceToDevice, current_stream)); // Signals completion to self - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + current_stream, + local_semaphore, + (cuuint32_t)(IpcSemaphore::kReady), + CU_STREAM_WRITE_VALUE_DEFAULT)); // Signals completion to receiver - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WRITE_VALUE_DEFAULT)); + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + current_stream, + remote_semaphore, + (cuuint32_t)(IpcSemaphore::kReady), + CU_STREAM_WRITE_VALUE_DEFAULT)); } else /*sender*/ { - std::cout << "RANK " << my_rank << " SEND, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; - std::cout << "RANK " << my_rank << " SEND BEFORE signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv 
tensor=" << buffer << std::endl; // signal to self that transfer is in progress - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + current_stream, + local_semaphore, + (cuuint32_t)(IpcSemaphore::kTransferInProgress), + CU_STREAM_WRITE_VALUE_DEFAULT)); // signal to receiver that the buffer is ready - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(current_stream, remote_semaphore, (cuuint32_t)(IpcSemaphore::kTransferInProgress), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER gives an error - std::cout << "RANK " << my_rank << " SEND AFTER signaling, peer=" << peer << ", local semaphore=" << local_semaphore << ", remote semaphore=" << remote_semaphore << ", my_buffer.ptr()=" << my_buffer.ptr() << ", buffer.data_ptr()=" << buffer.data_ptr() << "recv tensor=" << buffer << std::endl; - // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - // NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( - // peer_buffer.ptr(), - // buffer.data_ptr(), - // // my_buffer.ptr(), - // buffer.numel() * buffer.element_size(), - // cudaMemcpyDeviceToDevice - // // current_stream)); - // )); + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + current_stream, + remote_semaphore, + (cuuint32_t)(IpcSemaphore::kTransferInProgress), + CU_STREAM_WRITE_VALUE_DEFAULT)); // passing + // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + // gives an error } } @@ -691,23 +692,33 @@ void HostIrEvaluator::handle(Wait* wait) { return; } - const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as(); const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as(); const int64_t my_rank = communicator_->deviceId(); if (my_rank == src) { - const auto current_stream = static_cast(c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); - at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); + const auto 
current_stream = static_cast( + c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); + at::Tensor buffer = + getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); const auto it = remote_buffers_.find(buffer); - NVF_ERROR(it != remote_buffers_.end(), "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank); - const std::vector>& remote_buffers = it->second; + NVF_ERROR( + it != remote_buffers_.end(), + "No remote buffer found for ", + p2p_comm->toString(), + " at rank ", + my_rank); + const std::vector>& remote_buffers = + it->second; const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); - const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[dst]); - - std::cout << "RANK " << my_rank << " WAIT RECV BEFORE cuStreamWaitValue32 on local semaphore " << local_semaphore << std::endl; - NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32(current_stream, local_semaphore, (cuuint32_t)(IpcSemaphore::kReady), CU_STREAM_WAIT_VALUE_EQ)); - std::cout << "RANK " << my_rank << " FINISHED WAIT RECV AFTER cuStreamWaitValue32 on local semaphore " << local_semaphore << ", buffer.data_ptr()=" << buffer.data_ptr() << ", my_buffer.ptr()=" << my_buffer.ptr() << "recv tensor=" << buffer << std::endl; + const auto local_semaphore = + reinterpret_cast(&my_buffer.semaphores()[dst]); + + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( + current_stream, + local_semaphore, + (cuuint32_t)(IpcSemaphore::kReady), + CU_STREAM_WAIT_VALUE_EQ)); } } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index f052c1bfeb7..6727fc7622a 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -50,14 +50,10 @@ class HostIrExecutor : public ExecutorAbstract { namespace hir { -enum class IpcSemaphore : cuuint32_t { - kReady, - kTransferInProgress -}; +enum class IpcSemaphore : cuuint32_t { kReady, kTransferInProgress }; class RemoteBufferInfo { public: - RemoteBufferInfo(at::Tensor tensor); RemoteBufferInfo(std::vector data); // 
means it is imported ~RemoteBufferInfo(); @@ -189,7 +185,11 @@ class HostIrEvaluator final : public OptOutDispatch { return lhs.equal(rhs); } }; - std::unordered_map>, TensorHash, TensorEqual> + std::unordered_map< + at::Tensor, + std::vector>, + TensorHash, + TensorEqual> remote_buffers_; }; diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 5ea51fd82ff..edc9c476eaf 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -323,8 +323,10 @@ std::string EndCoalescing::toInlineString(int indent_size) const { NVF_CHECK(false, "Cannot be printed inline"); } - -ShareMemHandles::ShareMemHandles(IrBuilderPasskey passkey, std::vector communications) : Expr(passkey) { +ShareMemHandles::ShareMemHandles( + IrBuilderPasskey passkey, + std::vector communications) + : Expr(passkey) { NVF_ERROR(passkey.ir_container_ != nullptr); NVF_ERROR( passkey.ir_container_->isA(), diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index 64cdb404a8c..efb23b95d67 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -318,7 +318,9 @@ class EndCoalescing : public Expr { class ShareMemHandles : public Expr { public: using Expr::Expr; - ShareMemHandles(IrBuilderPasskey passkey, std::vector communications); + ShareMemHandles( + IrBuilderPasskey passkey, + std::vector communications); ShareMemHandles(const ShareMemHandles& other) = delete; ShareMemHandles& operator=(const ShareMemHandles& other) = delete; diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 53c9fbcead8..65b994aa125 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -11,10 +11,9 @@ #include #include // #include -#include -#include #include - +#include +#include #include #include diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index 74efa49efbe..9db4f3a78eb 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ 
b/tests/cpp/test_multidevice_communications.cpp @@ -413,7 +413,7 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(CommunicatorBackend::kNccl, CommunicatorBackend::kUcc), testing::PrintToStringParamName()); -using P2PCommunicationTest = MultiDeviceTest; +using P2PCommunicationTest = MultiDeviceTest; TEST_F(P2PCommunicationTest, CudaComm) { static constexpr int kTensorSize = 8; @@ -440,10 +440,13 @@ TEST_F(P2PCommunicationTest, CudaComm) { container->addInput(send_tv); container->addInput(recv_tv); - auto recv = IrBuilder::create(recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda); - auto send = IrBuilder::create(send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda); + auto recv = IrBuilder::create( + recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda); + auto send = IrBuilder::create( + send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda); std::vector grouped_communications = {recv, send}; - auto share_mem_handles = IrBuilder::create(std::move(grouped_communications)); + auto share_mem_handles = IrBuilder::create( + std::move(grouped_communications)); auto wait_recv = IrBuilder::create(recv); auto wait_send = IrBuilder::create(send); @@ -458,17 +461,21 @@ TEST_F(P2PCommunicationTest, CudaComm) { at::Tensor send_tensor = at::empty({kTensorSize}, tensor_options); at::Tensor recv_tensor = at::empty({kTensorSize}, tensor_options); - std::unordered_map inputs = {{send_tv, send_tensor}, {recv_tv, recv_tensor}}; + std::unordered_map inputs = { + {send_tv, send_tensor}, {recv_tv, recv_tensor}}; for (auto repetition : c10::irange(kNumRepetitions)) { - send_tensor.copy_(at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * my_rank); - std::cout << "RANK " << my_rank << " REPETITION " << repetition << ", send_peer=" << send_peer << ", recv_peer=" << recv_peer << ", send_tensor=" << send_tensor << std::endl; + send_tensor.copy_( + at::arange(kTensorSize, tensor_options) + repetition * 10 + + 100 * my_rank); 
executor.runWithInput(inputs); - std::cout << "RANK " << my_rank << " validation at" << " REPETITION " << repetition << std::endl; - auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + 100 * recv_peer; - EXPECT_TRUE(torch::allclose(recv_tensor, ref)) << "Rank " << my_rank << " failed at repetition " << repetition << " with recv tensor " << recv_tensor << " and ref " << ref; + auto ref = at::arange(kTensorSize, tensor_options) + repetition * 10 + + 100 * recv_peer; + EXPECT_TRUE(torch::allclose(recv_tensor, ref)) + << "Rank " << my_rank << " failed at repetition " << repetition + << " with recv tensor " << recv_tensor << " and ref " << ref; } } diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 413df0f06a4..a46fb5c1758 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -5,6 +5,7 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include #include #include #include @@ -13,7 +14,6 @@ #include #include #include -#include namespace nvfuser { @@ -148,9 +148,12 @@ TEST_F(StreamOpTest, StreamWriteValue32) { NVFUSER_CUDA_RT_SAFE_CALL(cudaSetDevice(0)); NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamCreate(&stream)); NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc(&buf, sizeof(int))); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream)); - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32(stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync(&value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + buf, &value, sizeof(int), cudaMemcpyHostToDevice, stream)); + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + stream, (CUdeviceptr)buf, new_value, CU_STREAM_WRITE_VALUE_DEFAULT)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + &value, buf, sizeof(int), cudaMemcpyDeviceToHost, stream)); 
NVFUSER_CUDA_RT_SAFE_CALL(cudaStreamSynchronize(stream)); EXPECT_EQ(value, new_value); } diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp index c1df5684c47..af7e153ba46 100644 --- a/tests/cpp/test_multidevice_overlap.cpp +++ b/tests/cpp/test_multidevice_overlap.cpp @@ -10,9 +10,9 @@ #include #include // #include -#include #include #include +#include #include #include #include From 326b683932536d067e2b3c6063e90e72e117a560 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 09:39:27 -0800 Subject: [PATCH 47/55] Move distributed tensors to separate file --- CMakeLists.txt | 1 + csrc/host_ir/executor.cpp | 111 ++++++----------------- csrc/host_ir/executor.h | 30 +----- csrc/multidevice/communicator.h | 12 --- tests/cpp/test_multidevice_gpu_comms.cpp | 1 + 5 files changed, 31 insertions(+), 124 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e59cfddc65..ee371dc64dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,6 +169,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/mma_type.cpp ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp + ${NVFUSER_SRCS_DIR}/multidevice/distributed_buffer.cpp ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp ${NVFUSER_SRCS_DIR}/multidevice/utils.cpp diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index ae4dbb028b8..bb5462d6985 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -8,7 +8,6 @@ #include -#include #include #include #include @@ -410,64 +409,6 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } } -RemoteBufferInfo::RemoteBufferInfo(at::Tensor tensor) - : ptr_(tensor.data_ptr()), - storage_offset_(tensor.storage_offset()), - element_size_(tensor.element_size()), - is_imported_(false) { - NVFUSER_CUDA_RT_SAFE_CALL( - cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); - const auto number_of_semaphores = 
Communicator::getInstance().size(); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc( - (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore))); - static_assert( - sizeof(IpcSemaphore) == sizeof(int), - "IpcSemaphore must be same size as int"); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset( - (void*)semaphores_, - (int)IpcSemaphore::kReady, - number_of_semaphores * sizeof(IpcSemaphore))); - NVFUSER_CUDA_RT_SAFE_CALL( - cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); -} - -RemoteBufferInfo::RemoteBufferInfo(std::vector data) - : is_imported_(true) { - const RemoteBufferInfo& imported_buffer = fromBytes(data); - - storage_offset_ = imported_buffer.storage_offset_; - element_size_ = imported_buffer.element_size_; - ipc_handle_ = imported_buffer.ipc_handle_; - semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; - - NVFUSER_CUDA_RT_SAFE_CALL( - cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); - ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); - - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( - (void**)&semaphores_, - semaphores_ipc_handle_, - cudaIpcMemLazyEnablePeerAccess)); -} - -RemoteBufferInfo::~RemoteBufferInfo() { - if (is_imported_) { - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_)); - } else { - NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_)); - } -} - -std::ostream& operator<<(std::ostream& os, const RemoteBufferInfo& info) { - os << "RemoteBufferInfo(ptr=" << info.ptr_ - << ", storage_offset=" << info.storage_offset_ - << ", element_size=" << info.element_size_ - << ", is_imported=" << info.is_imported_ - << ", semaphores_=" << info.semaphores_ << ")"; - return os; -} - void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { const int64_t my_rank = communicator_->deviceId(); auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { @@ -488,8 +429,8 @@ void 
HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { is_receiver)) { // REMOVE or adapt exporting/opening the handle return; } - if (remote_buffers_.find(get_tensor(communication)) != - remote_buffers_.end()) { + if (distributed_buffers_.find(get_tensor(communication)) != + distributed_buffers_.end()) { continue; } communications.push_back(communication); @@ -498,7 +439,7 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // put memhandles to TCP store auto get_key = [this](P2PCommunication* communication, int64_t rank) -> std::string { - return "nvfuser_remote_buffer_info_P2PComm_dst=" + + return "nvfuser_distributed_buffer_info_P2PComm_dst=" + std::to_string(this->expr_evaluator_.evaluate(communication->dst()) .as()) + "_src=" + @@ -506,12 +447,12 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { .as()) + "_rank=" + std::to_string(rank); }; - std::unordered_map> + std::unordered_map> buffer_handles; auto store = communicator_->getTcpStore(); for (P2PCommunication* communication : communications) { auto buffer_handle = - std::make_unique(get_tensor(communication)); + std::make_unique(get_tensor(communication)); store->set(get_key(communication, my_rank), toBytes(*buffer_handle)); buffer_handles.emplace(communication, std::move(buffer_handle)); } @@ -522,20 +463,20 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // get memhandles to TCP store for (P2PCommunication* communication : communications) { - std::vector> remote_buffers; - remote_buffers.reserve(communicator_->size()); + std::vector> distributed_buffers; + distributed_buffers.reserve(communicator_->size()); const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); const auto src = expr_evaluator_.evaluate(communication->src()).as(); for (int64_t rank : c10::irange(communicator_->size())) { if (rank != src && rank != dst) { - remote_buffers.push_back(nullptr); + distributed_buffers.push_back(nullptr); continue; } if (rank == 
my_rank) { // opening an ipc handle on the exporter's device is not supported - remote_buffers.push_back(std::move(buffer_handles.at(communication))); + distributed_buffers.push_back(std::move(buffer_handles.at(communication))); } else { std::string key = get_key(communication, rank); NVF_ERROR( @@ -544,13 +485,13 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { key, " not found in store at rank ", my_rank); - auto imported_remote_buffer_info = - std::make_unique(store->get(key)); - remote_buffers.push_back(std::move(imported_remote_buffer_info)); + auto imported_distributed_buffer_info = + std::make_unique(store->get(key)); + distributed_buffers.push_back(std::move(imported_distributed_buffer_info)); } } - remote_buffers_.emplace( - get_tensor(communication), std::move(remote_buffers)); + distributed_buffers_.emplace( + get_tensor(communication), std::move(distributed_buffers)); } } @@ -610,18 +551,18 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { return; } - const auto it = remote_buffers_.find(buffer); + const auto it = distributed_buffers_.find(buffer); NVF_ERROR( - it != remote_buffers_.end(), + it != distributed_buffers_.end(), "No remote buffer found for ", communication->toString(), " at rank ", my_rank); - const std::vector>& remote_buffers = + const std::vector>& distributed_buffers = it->second; const int64_t peer = is_sender ? 
dst : src; - const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); - const RemoteBufferInfo& peer_buffer = *remote_buffers.at(peer); + const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank); + const DistributedBuffer& peer_buffer = *distributed_buffers.at(peer); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); const auto remote_semaphore = @@ -637,7 +578,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( current_stream, local_semaphore, - (cuuint32_t)(IpcSemaphore::kTransferInProgress), + (cuuint32_t)(IpcSemaphore::kInUse), CU_STREAM_WAIT_VALUE_EQ)); // RDMA get the data from the sender NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( @@ -663,13 +604,13 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( current_stream, local_semaphore, - (cuuint32_t)(IpcSemaphore::kTransferInProgress), + (cuuint32_t)(IpcSemaphore::kInUse), CU_STREAM_WRITE_VALUE_DEFAULT)); // signal to receiver that the buffer is ready NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( current_stream, remote_semaphore, - (cuuint32_t)(IpcSemaphore::kTransferInProgress), + (cuuint32_t)(IpcSemaphore::kInUse), CU_STREAM_WRITE_VALUE_DEFAULT)); // passing // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER // gives an error @@ -700,17 +641,17 @@ void HostIrEvaluator::handle(Wait* wait) { c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); - const auto it = remote_buffers_.find(buffer); + const auto it = distributed_buffers_.find(buffer); NVF_ERROR( - it != remote_buffers_.end(), + it != distributed_buffers_.end(), "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank); - const std::vector>& remote_buffers = + const std::vector>& distributed_buffers = it->second; - const RemoteBufferInfo& my_buffer = *remote_buffers.at(my_rank); + 
const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[dst]); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 6727fc7622a..634b27c755e 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -50,31 +51,6 @@ class HostIrExecutor : public ExecutorAbstract { namespace hir { -enum class IpcSemaphore : cuuint32_t { kReady, kTransferInProgress }; - -class RemoteBufferInfo { - public: - RemoteBufferInfo(at::Tensor tensor); - RemoteBufferInfo(std::vector data); // means it is imported - ~RemoteBufferInfo(); - - void* ptr() const { - return ptr_; - } - - auto semaphores() const { - return semaphores_; - } - - void* ptr_; - int64_t storage_offset_; - int64_t element_size_; - bool is_imported_; - cudaIpcMemHandle_t ipc_handle_; - cudaIpcMemHandle_t semaphores_ipc_handle_; - IpcSemaphore* semaphores_; -}; - /* a HostIrEvaluator evaluates a host programs represented through a HostIrContainer It is instantiated with the desired HostIrContainer, and runs @@ -187,10 +163,10 @@ class HostIrEvaluator final : public OptOutDispatch { }; std::unordered_map< at::Tensor, - std::vector>, + std::vector>, TensorHash, TensorEqual> - remote_buffers_; + distributed_buffers_; }; } // namespace hir diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 65b994aa125..e8b71df1465 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -28,18 +28,6 @@ namespace nvfuser { -template -std::vector toBytes(const T& data) { - return std::vector( - reinterpret_cast(&data), - reinterpret_cast(&data) + sizeof(T)); -} - -template -const T& fromBytes(const std::vector& bytes) { - return *reinterpret_cast(bytes.data()); -} - // This file implements the class Communicator which sets up the inter-process // Backend. 
This class contains inter-process information, such as the rank, the // world size, as well as the Process Group that can be called to perform diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index a46fb5c1758..acddba06547 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include From cf8991c033eebe08186e05fbe9e08f7bafe88fe9 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 09:42:48 -0800 Subject: [PATCH 48/55] rename DistributedBuffer to IpcHandle --- CMakeLists.txt | 2 +- csrc/host_ir/executor.cpp | 46 ++++++++++++------------ csrc/host_ir/executor.h | 6 ++-- tests/cpp/test_multidevice_gpu_comms.cpp | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee371dc64dc..3c862cc5a3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,7 +169,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/mma_type.cpp ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp - ${NVFUSER_SRCS_DIR}/multidevice/distributed_buffer.cpp + ${NVFUSER_SRCS_DIR}/multidevice/ipc_handle.cpp ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp ${NVFUSER_SRCS_DIR}/multidevice/utils.cpp diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index bb5462d6985..a4e9ec1a701 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -429,8 +429,8 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { is_receiver)) { // REMOVE or adapt exporting/opening the handle return; } - if (distributed_buffers_.find(get_tensor(communication)) != - distributed_buffers_.end()) { + if (ipc_handles_.find(get_tensor(communication)) != + ipc_handles_.end()) { continue; } communications.push_back(communication); @@ -439,7 +439,7 @@ void 
HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // put memhandles to TCP store auto get_key = [this](P2PCommunication* communication, int64_t rank) -> std::string { - return "nvfuser_distributed_buffer_info_P2PComm_dst=" + + return "nvfuser_ipc_handle_info_P2PComm_dst=" + std::to_string(this->expr_evaluator_.evaluate(communication->dst()) .as()) + "_src=" + @@ -447,12 +447,12 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { .as()) + "_rank=" + std::to_string(rank); }; - std::unordered_map> + std::unordered_map> buffer_handles; auto store = communicator_->getTcpStore(); for (P2PCommunication* communication : communications) { auto buffer_handle = - std::make_unique(get_tensor(communication)); + std::make_unique(get_tensor(communication)); store->set(get_key(communication, my_rank), toBytes(*buffer_handle)); buffer_handles.emplace(communication, std::move(buffer_handle)); } @@ -463,20 +463,20 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // get memhandles to TCP store for (P2PCommunication* communication : communications) { - std::vector> distributed_buffers; - distributed_buffers.reserve(communicator_->size()); + std::vector> ipc_handles; + ipc_handles.reserve(communicator_->size()); const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); const auto src = expr_evaluator_.evaluate(communication->src()).as(); for (int64_t rank : c10::irange(communicator_->size())) { if (rank != src && rank != dst) { - distributed_buffers.push_back(nullptr); + ipc_handles.push_back(nullptr); continue; } if (rank == my_rank) { // opening an ipc handle on the exporter's device is not supported - distributed_buffers.push_back(std::move(buffer_handles.at(communication))); + ipc_handles.push_back(std::move(buffer_handles.at(communication))); } else { std::string key = get_key(communication, rank); NVF_ERROR( @@ -485,13 +485,13 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { key, " not found in 
store at rank ", my_rank); - auto imported_distributed_buffer_info = - std::make_unique(store->get(key)); - distributed_buffers.push_back(std::move(imported_distributed_buffer_info)); + auto imported_ipc_handle_info = + std::make_unique(store->get(key)); + ipc_handles.push_back(std::move(imported_ipc_handle_info)); } } - distributed_buffers_.emplace( - get_tensor(communication), std::move(distributed_buffers)); + ipc_handles_.emplace( + get_tensor(communication), std::move(ipc_handles)); } } @@ -551,18 +551,18 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { return; } - const auto it = distributed_buffers_.find(buffer); + const auto it = ipc_handles_.find(buffer); NVF_ERROR( - it != distributed_buffers_.end(), + it != ipc_handles_.end(), "No remote buffer found for ", communication->toString(), " at rank ", my_rank); - const std::vector>& distributed_buffers = + const std::vector>& ipc_handles = it->second; const int64_t peer = is_sender ? dst : src; - const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank); - const DistributedBuffer& peer_buffer = *distributed_buffers.at(peer); + const IpcHandle& my_buffer = *ipc_handles.at(my_rank); + const IpcHandle& peer_buffer = *ipc_handles.at(peer); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[peer]); const auto remote_semaphore = @@ -641,17 +641,17 @@ void HostIrEvaluator::handle(Wait* wait) { c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); - const auto it = distributed_buffers_.find(buffer); + const auto it = ipc_handles_.find(buffer); NVF_ERROR( - it != distributed_buffers_.end(), + it != ipc_handles_.end(), "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank); - const std::vector>& distributed_buffers = + const std::vector>& ipc_handles = it->second; - const DistributedBuffer& my_buffer = *distributed_buffers.at(my_rank); + const IpcHandle& 
my_buffer = *ipc_handles.at(my_rank); const auto local_semaphore = reinterpret_cast(&my_buffer.semaphores()[dst]); diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 634b27c755e..2badda7f516 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -163,10 +163,10 @@ class HostIrEvaluator final : public OptOutDispatch { }; std::unordered_map< at::Tensor, - std::vector>, + std::vector>, TensorHash, TensorEqual> - distributed_buffers_; + ipc_handles_; }; } // namespace hir diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index acddba06547..75a6aeba472 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include From 541fe8020b57477744edd8f78c8084c6c5b10691 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 13:14:46 -0800 Subject: [PATCH 49/55] working chkpt. 
Added in the commit the new files that were forgotten before --- csrc/host_ir/executor.cpp | 31 +++--- csrc/host_ir/executor.h | 21 +---- csrc/multidevice/ipc_handle.cpp | 63 +++++++++++++ csrc/multidevice/ipc_handle.h | 161 ++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 37 deletions(-) create mode 100644 csrc/multidevice/ipc_handle.cpp create mode 100644 csrc/multidevice/ipc_handle.h diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index a4e9ec1a701..7cebe36f792 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -429,8 +429,7 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { is_receiver)) { // REMOVE or adapt exporting/opening the handle return; } - if (ipc_handles_.find(get_tensor(communication)) != - ipc_handles_.end()) { + if (ipc_handle_cache_.find(communication, expr_evaluator_) != nullptr) { continue; } communications.push_back(communication); @@ -463,20 +462,20 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { // get memhandles to TCP store for (P2PCommunication* communication : communications) { - std::vector> ipc_handles; - ipc_handles.reserve(communicator_->size()); + auto ipc_handles = std::make_unique>>(); + ipc_handles->reserve(communicator_->size()); const auto dst = expr_evaluator_.evaluate(communication->dst()).as(); const auto src = expr_evaluator_.evaluate(communication->src()).as(); for (int64_t rank : c10::irange(communicator_->size())) { if (rank != src && rank != dst) { - ipc_handles.push_back(nullptr); + ipc_handles->push_back(nullptr); continue; } if (rank == my_rank) { // opening an ipc handle on the exporter's device is not supported - ipc_handles.push_back(std::move(buffer_handles.at(communication))); + ipc_handles->push_back(std::move(buffer_handles.at(communication))); } else { std::string key = get_key(communication, rank); NVF_ERROR( @@ -487,11 +486,11 @@ void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { my_rank); 
auto imported_ipc_handle_info = std::make_unique(store->get(key)); - ipc_handles.push_back(std::move(imported_ipc_handle_info)); + ipc_handles->push_back(std::move(imported_ipc_handle_info)); } } - ipc_handles_.emplace( - get_tensor(communication), std::move(ipc_handles)); + ipc_handle_cache_.insert( + communication, expr_evaluator_, std::move(ipc_handles)); } } @@ -551,15 +550,14 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { return; } - const auto it = ipc_handles_.find(buffer); + const auto it = ipc_handle_cache_.find(communication, expr_evaluator_); NVF_ERROR( - it != ipc_handles_.end(), + it != nullptr, "No remote buffer found for ", communication->toString(), " at rank ", my_rank); - const std::vector>& ipc_handles = - it->second; + const std::vector>& ipc_handles = *it; const int64_t peer = is_sender ? dst : src; const IpcHandle& my_buffer = *ipc_handles.at(my_rank); const IpcHandle& peer_buffer = *ipc_handles.at(peer); @@ -641,15 +639,14 @@ void HostIrEvaluator::handle(Wait* wait) { c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); - const auto it = ipc_handles_.find(buffer); + const auto it = ipc_handle_cache_.find(p2p_comm, expr_evaluator_); NVF_ERROR( - it != ipc_handles_.end(), + it != nullptr, "No remote buffer found for ", p2p_comm->toString(), " at rank ", my_rank); - const std::vector>& ipc_handles = - it->second; + const std::vector>& ipc_handles = *it; const IpcHandle& my_buffer = *ipc_handles.at(my_rank); const auto local_semaphore = diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 2badda7f516..baac74b6756 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -147,26 +147,7 @@ class HostIrEvaluator final : public OptOutDispatch { std::unordered_map streams_; std::unordered_map> works_; const int64_t my_local_device_index_; - struct TensorHash { - std::size_t operator()(const at::Tensor& 
tensor) const { - auto ptr = reinterpret_cast(tensor.data_ptr()); - auto offset = tensor.storage_offset(); - auto element_size = tensor.element_size(); - return std::hash()(ptr) ^ std::hash()(offset) ^ - std::hash()(element_size); - } - }; - struct TensorEqual { - bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const { - return lhs.equal(rhs); - } - }; - std::unordered_map< - at::Tensor, - std::vector>, - TensorHash, - TensorEqual> - ipc_handles_; + IpcHandleCache ipc_handle_cache_; }; } // namespace hir diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp new file mode 100644 index 00000000000..ccaf9bf5c4d --- /dev/null +++ b/csrc/multidevice/ipc_handle.cpp @@ -0,0 +1,63 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +namespace nvfuser { + +IpcHandle::IpcHandle(at::Tensor tensor) + : ptr_(tensor.data_ptr()), + storage_offset_(tensor.storage_offset()), + element_size_(tensor.element_size()), + is_imported_(false) { + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); + const auto number_of_semaphores = Communicator::getInstance().size(); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc( + (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore))); + static_assert( + sizeof(IpcSemaphore) == sizeof(int), + "IpcSemaphore must be same size as int"); + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset( + (void*)semaphores_, + (int)IpcSemaphore::kReady, + number_of_semaphores * sizeof(IpcSemaphore))); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); +} + +IpcHandle::IpcHandle(std::vector data) + : is_imported_(true) { + const IpcHandle& imported_buffer = fromBytes(data); + + storage_offset_ = imported_buffer.storage_offset_; + element_size_ = imported_buffer.element_size_; + 
ipc_handle_ = imported_buffer.ipc_handle_; + semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; + + NVFUSER_CUDA_RT_SAFE_CALL( + cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); + ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); + + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( + (void**)&semaphores_, + semaphores_ipc_handle_, + cudaIpcMemLazyEnablePeerAccess)); +} + +IpcHandle::~IpcHandle() { + if (is_imported_) { + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_)); + } else { + NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_)); + } +} + +} // nvfuser diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h new file mode 100644 index 00000000000..5ab790f7e8f --- /dev/null +++ b/csrc/multidevice/ipc_handle.h @@ -0,0 +1,161 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once +#include +#include +#include + +namespace nvfuser { + +template +std::vector toBytes(const T& data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +const T& fromBytes(const std::vector& bytes) { + return *reinterpret_cast(bytes.data()); +} + +enum class IpcSemaphore : cuuint32_t { kReady, kInUse }; + +class IpcHandle { + public: + IpcHandle(at::Tensor tensor); + IpcHandle(std::vector data); // means it is imported + ~IpcHandle(); + + void* ptr() const { + return ptr_; + } + + auto semaphores() const { + return semaphores_; + } + + private: + void* ptr_; + int64_t storage_offset_; + int64_t element_size_; + bool is_imported_; + cudaIpcMemHandle_t ipc_handle_; + cudaIpcMemHandle_t semaphores_ipc_handle_; + IpcSemaphore* semaphores_; +}; + +using P2pIpcHandle = std::vector>; +// class P2pIpcHandle { +// public: +// P2pIpcHandle(IpcHandle local_handle, IpcHandle peer_handle) +// : local_handle_(local_handle), peer_handle_(peer_handle) {}; + +// ~P2pIpcHandle(); + +// const auto& local() { +// return local_handle_; +// } + +// const auto& peer() { +// return peer_handle_; +// } + +// private: +// IpcHandle local_handle_; +// IpcHandle peer_handle_; +// }; + + +class IpcHandleCache { + public: + IpcHandleCache() = default; + ~IpcHandleCache() = default; + + + P2pIpcHandle* find(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) { + return find(getKey(comm, expr_evaluator)); + } + + void insert(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator, std::unique_ptr handle) { + handles_[getKey(comm, expr_evaluator)] = std::move(handle); + } + + private: + using KeyType = std::tuple; + + KeyType getKey(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) { + int64_t dst = expr_evaluator.evaluate(comm->dst()).as(); + int64_t src = expr_evaluator.evaluate(comm->src()).as(); + at::Tensor buffer = 
expr_evaluator.evaluate(comm->buffer()).as(); + return std::make_tuple(dst, src, buffer, comm); + } + + void insert(KeyType key, std::unique_ptr handle) { + handles_[key] = std::move(handle); + } + + P2pIpcHandle* find(KeyType key) { + auto it = handles_.find(key); + if (it == handles_.end()) { + return nullptr; + } + return it->second.get(); + } + + struct TensorHash { + std::size_t operator()(const at::Tensor& tensor) const { + auto ptr = reinterpret_cast(tensor.data_ptr()); + auto offset = tensor.storage_offset(); + auto element_size = tensor.element_size(); + return std::hash()(ptr) ^ std::hash()(offset) ^ + std::hash()(element_size); + } + }; + + struct TensorEqual { + bool operator()(const at::Tensor& lhs, const at::Tensor& rhs) const { + return lhs.equal(rhs); + } + }; + + + + struct KeyHash { + std::size_t operator()(const KeyType& key) const { + return (std::hash()(std::get<0>(key)) << 13) ^ + (std::hash()(std::get<1>(key)) << 7) ^ + (TensorHash{}(std::get<2>(key))) ^ + (std::hash()(std::get<3>(key))); + } + }; + + struct KeyEqual { + bool operator()(const KeyType& lhs, const KeyType& rhs) const { + return std::get<0>(lhs) == std::get<0>(rhs) && + std::get<1>(lhs) == std::get<1>(rhs) && + TensorEqual{}(std::get<2>(lhs), std::get<2>(rhs)) && + std::get<3>(lhs) == std::get<3>(rhs); + } + }; + + std::unordered_map< + KeyType, + std::unique_ptr, + KeyHash, + KeyEqual> + handles_; +}; + + +// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*) +// we need a counter on Tensor+P2PCommunication* for each given dst, src +// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID) +// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle. 
+ +} // nvfuser From a4960044cb1ee41c5eacc733d17c11761791df27 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 14:39:41 -0800 Subject: [PATCH 50/55] refactor --- csrc/host_ir/executor.cpp | 115 ++------------------- csrc/multidevice/ipc_handle.cpp | 124 ++++++++++++++++++++--- csrc/multidevice/ipc_handle.h | 85 +++++++--------- tests/cpp/test_multidevice_gpu_comms.cpp | 12 +++ 4 files changed, 165 insertions(+), 171 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 7cebe36f792..4d0bbc34776 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -410,88 +410,7 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { - const int64_t my_rank = communicator_->deviceId(); - auto get_tensor = [this](P2PCommunication* communication) -> at::Tensor { - return this->expr_evaluator_.evaluate(communication->buffer()) - .as(); - }; - - std::vector communications; - for (auto expr : share_mem_handles->communications()) { - auto communication = expr->as(); - const auto dst = - expr_evaluator_.evaluate(communication->dst()).as(); - const auto src = - expr_evaluator_.evaluate(communication->src()).as(); - const bool is_sender = my_rank == src; - const bool is_receiver = my_rank == dst; - if (!(is_sender ^ - is_receiver)) { // REMOVE or adapt exporting/opening the handle - return; - } - if (ipc_handle_cache_.find(communication, expr_evaluator_) != nullptr) { - continue; - } - communications.push_back(communication); - } - - // put memhandles to TCP store - auto get_key = - [this](P2PCommunication* communication, int64_t rank) -> std::string { - return "nvfuser_ipc_handle_info_P2PComm_dst=" + - std::to_string(this->expr_evaluator_.evaluate(communication->dst()) - .as()) + - "_src=" + - std::to_string(this->expr_evaluator_.evaluate(communication->src()) - .as()) + - "_rank=" + std::to_string(rank); - }; - std::unordered_map> - buffer_handles; - auto 
store = communicator_->getTcpStore(); - for (P2PCommunication* communication : communications) { - auto buffer_handle = - std::make_unique(get_tensor(communication)); - store->set(get_key(communication, my_rank), toBytes(*buffer_handle)); - buffer_handles.emplace(communication, std::move(buffer_handle)); - } - - // barrier to ensure all ranks have pushed their memhandles to the store - // TODO: precisely select what ranks need to wait on that barrier. - communicator_->barrier(); - - // get memhandles to TCP store - for (P2PCommunication* communication : communications) { - auto ipc_handles = std::make_unique>>(); - ipc_handles->reserve(communicator_->size()); - const auto dst = - expr_evaluator_.evaluate(communication->dst()).as(); - const auto src = - expr_evaluator_.evaluate(communication->src()).as(); - for (int64_t rank : c10::irange(communicator_->size())) { - if (rank != src && rank != dst) { - ipc_handles->push_back(nullptr); - continue; - } - if (rank == my_rank) { - // opening an ipc handle on the exporter's device is not supported - ipc_handles->push_back(std::move(buffer_handles.at(communication))); - } else { - std::string key = get_key(communication, rank); - NVF_ERROR( - store->check({key}), - "key ", - key, - " not found in store at rank ", - my_rank); - auto imported_ipc_handle_info = - std::make_unique(store->get(key)); - ipc_handles->push_back(std::move(imported_ipc_handle_info)); - } - } - ipc_handle_cache_.insert( - communication, expr_evaluator_, std::move(ipc_handles)); - } + ipc_handle_cache_.exchangeHandles(share_mem_handles->communications(), expr_evaluator_); } void HostIrEvaluator::handle(Communication* communication) { @@ -550,21 +469,12 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { return; } - const auto it = ipc_handle_cache_.find(communication, expr_evaluator_); - NVF_ERROR( - it != nullptr, - "No remote buffer found for ", - communication->toString(), - " at rank ", - my_rank); - const std::vector>& ipc_handles = 
*it; - const int64_t peer = is_sender ? dst : src; - const IpcHandle& my_buffer = *ipc_handles.at(my_rank); - const IpcHandle& peer_buffer = *ipc_handles.at(peer); + const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(communication, expr_evaluator_); + const IpcHandle& peer_buffer = ipc_handles.peer(); const auto local_semaphore = - reinterpret_cast(&my_buffer.semaphores()[peer]); + reinterpret_cast(ipc_handles.local().semaphore()); const auto remote_semaphore = - reinterpret_cast(&peer_buffer.semaphores()[my_rank]); + reinterpret_cast(ipc_handles.peer().semaphore()); static_assert( sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); @@ -631,7 +541,6 @@ void HostIrEvaluator::handle(Wait* wait) { return; } - const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as(); const auto src = expr_evaluator_.evaluate(p2p_comm->src()).as(); const int64_t my_rank = communicator_->deviceId(); if (my_rank == src) { @@ -639,18 +548,10 @@ void HostIrEvaluator::handle(Wait* wait) { c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); - const auto it = ipc_handle_cache_.find(p2p_comm, expr_evaluator_); - NVF_ERROR( - it != nullptr, - "No remote buffer found for ", - p2p_comm->toString(), - " at rank ", - my_rank); - const std::vector>& ipc_handles = *it; - - const IpcHandle& my_buffer = *ipc_handles.at(my_rank); + + const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm, expr_evaluator_); const auto local_semaphore = - reinterpret_cast(&my_buffer.semaphores()[dst]); + reinterpret_cast(ipc_handles.local().semaphore()); NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( current_stream, diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp index ccaf9bf5c4d..9e3c893f223 100644 --- a/csrc/multidevice/ipc_handle.cpp +++ b/csrc/multidevice/ipc_handle.cpp @@ -11,52 +11,146 @@ namespace nvfuser { +namespace { + +template +std::vector 
toBytes(const T& data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +const T& fromBytes(const std::vector& bytes) { + return *reinterpret_cast(bytes.data()); +} + +} // namespace + + IpcHandle::IpcHandle(at::Tensor tensor) : ptr_(tensor.data_ptr()), storage_offset_(tensor.storage_offset()), element_size_(tensor.element_size()), - is_imported_(false) { + rank_(Communicator::getInstance().deviceId()) { NVFUSER_CUDA_RT_SAFE_CALL( cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); - const auto number_of_semaphores = Communicator::getInstance().size(); NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc( - (void**)&semaphores_, number_of_semaphores * sizeof(IpcSemaphore))); + (void**)&semaphore_, sizeof(IpcSemaphore))); static_assert( sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int"); NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset( - (void*)semaphores_, + (void*)semaphore_, (int)IpcSemaphore::kReady, - number_of_semaphores * sizeof(IpcSemaphore))); + sizeof(IpcSemaphore))); NVFUSER_CUDA_RT_SAFE_CALL( - cudaIpcGetMemHandle(&semaphores_ipc_handle_, semaphores_)); + cudaIpcGetMemHandle(&semaphore_ipc_handle_, semaphore_)); } -IpcHandle::IpcHandle(std::vector data) - : is_imported_(true) { +IpcHandle::IpcHandle(std::vector data) { const IpcHandle& imported_buffer = fromBytes(data); storage_offset_ = imported_buffer.storage_offset_; element_size_ = imported_buffer.element_size_; ipc_handle_ = imported_buffer.ipc_handle_; - semaphores_ipc_handle_ = imported_buffer.semaphores_ipc_handle_; + semaphore_ipc_handle_ = imported_buffer.semaphore_ipc_handle_; + rank_ = imported_buffer.rank_; NVFUSER_CUDA_RT_SAFE_CALL( cudaIpcOpenMemHandle(&ptr_, ipc_handle_, cudaIpcMemLazyEnablePeerAccess)); ptr_ = (void*)((uint8_t*)ptr_ + storage_offset_ * element_size_); NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcOpenMemHandle( - (void**)&semaphores_, - semaphores_ipc_handle_, + (void**)&semaphore_, + semaphore_ipc_handle_, 
cudaIpcMemLazyEnablePeerAccess)); } IpcHandle::~IpcHandle() { - if (is_imported_) { - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_)); - NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphores_)); + if (rank_ == Communicator::getInstance().deviceId()) { + NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphore_)); } else { - NVFUSER_CUDA_RT_SAFE_CALL(cudaFree((void*)semaphores_)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle(ptr_)); + NVFUSER_CUDA_RT_SAFE_CALL(cudaIpcCloseMemHandle((void*)semaphore_)); + } +} + + +void IpcHandleCache::exchangeHandles(const std::vector& communications, const ExpressionEvaluator& expr_evaluator) { + Communicator* communicator = &Communicator::getInstance(); + const int64_t my_rank = communicator->deviceId(); + auto get_tensor = [&expr_evaluator](P2PCommunication* communication) -> at::Tensor { + return expr_evaluator.evaluate(communication->buffer()) + .as(); + }; + + std::vector non_cached_communications; + for (auto communication : communications) { + const auto dst = + expr_evaluator.evaluate(communication->dst()).as(); + const auto src = + expr_evaluator.evaluate(communication->src()).as(); + const bool is_sender = my_rank == src; + const bool is_receiver = my_rank == dst; + NVF_ERROR(is_sender || is_receiver, "RANK ", my_rank, " is not involved in the p2p comm ", communication); + if (is_sender && is_receiver) { + continue; + } + if (find(communication, expr_evaluator) != nullptr) { + continue; + } + non_cached_communications.push_back(communication); + } + + // put memhandles to TCP store + auto get_tcp_store_key = + [&expr_evaluator](P2PCommunication* communication, int64_t rank) -> std::string { + return "nvfuser_ipc_handle_info_P2PComm_dst=" + + std::to_string(expr_evaluator.evaluate(communication->dst()) + .as()) + + "_src=" + + std::to_string(expr_evaluator.evaluate(communication->src()) + .as()) + + "_rank=" + std::to_string(rank); + }; + std::unordered_map> + local_ipc_handles; + auto store = 
communicator->getTcpStore(); + for (P2PCommunication* communication : non_cached_communications) { + auto buffer_handle = + std::make_unique(get_tensor(communication)); + store->set(get_tcp_store_key(communication, my_rank), toBytes(*buffer_handle)); + local_ipc_handles.emplace(communication, std::move(buffer_handle)); + } + + // barrier to ensure all ranks have pushed their memhandles to the store + // TODO: precisely select what ranks need to wait on that barrier. + communicator->barrier(); + + // get memhandles from TCP store + for (P2PCommunication* communication : non_cached_communications) { + const auto dst = + expr_evaluator.evaluate(communication->dst()).as(); + const auto src = + expr_evaluator.evaluate(communication->src()).as(); + int64_t peer = (my_rank == dst) ? src : dst; + + auto& local_ipc_handle = local_ipc_handles.at(communication); + + std::string key = get_tcp_store_key(communication, peer); + NVF_ERROR( + store->check({key}), + "key ", + key, + " not found in store at rank ", + my_rank); + auto peer_ipc_handle = std::make_unique(store->get(key)); + + auto ipc_handles = std::make_unique(std::move(local_ipc_handle), std::move(peer_ipc_handle)); + + insert( + communication, expr_evaluator, std::move(ipc_handles)); } } diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h index 5ab790f7e8f..fb8c025261d 100644 --- a/csrc/multidevice/ipc_handle.h +++ b/csrc/multidevice/ipc_handle.h @@ -12,18 +12,6 @@ namespace nvfuser { -template -std::vector toBytes(const T& data) { - return std::vector( - reinterpret_cast(&data), - reinterpret_cast(&data) + sizeof(T)); -} - -template -const T& fromBytes(const std::vector& bytes) { - return *reinterpret_cast(bytes.data()); -} - enum class IpcSemaphore : cuuint32_t { kReady, kInUse }; class IpcHandle { @@ -36,68 +24,69 @@ class IpcHandle { return ptr_; } - auto semaphores() const { - return semaphores_; + auto semaphore() const { + return semaphore_; } private: void* ptr_; int64_t 
storage_offset_; int64_t element_size_; - bool is_imported_; cudaIpcMemHandle_t ipc_handle_; - cudaIpcMemHandle_t semaphores_ipc_handle_; - IpcSemaphore* semaphores_; + cudaIpcMemHandle_t semaphore_ipc_handle_; + IpcSemaphore* semaphore_; + int64_t rank_; }; -using P2pIpcHandle = std::vector>; -// class P2pIpcHandle { -// public: -// P2pIpcHandle(IpcHandle local_handle, IpcHandle peer_handle) -// : local_handle_(local_handle), peer_handle_(peer_handle) {}; - -// ~P2pIpcHandle(); +class P2pIpcHandle { + public: -// const auto& local() { -// return local_handle_; -// } + P2pIpcHandle(std::unique_ptr local, std::unique_ptr peer) : local_(std::move(local)), peer_(std::move(peer)) {} -// const auto& peer() { -// return peer_handle_; -// } + const auto& local() const { + return *local_; + } -// private: -// IpcHandle local_handle_; -// IpcHandle peer_handle_; -// }; + const auto& peer() const { + return *peer_; + } + private: + std::unique_ptr local_; + std::unique_ptr peer_; +}; class IpcHandleCache { - public: - IpcHandleCache() = default; - ~IpcHandleCache() = default; - - - P2pIpcHandle* find(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) { - return find(getKey(comm, expr_evaluator)); + public: + IpcHandleCache() = default; + ~IpcHandleCache() = default; + + const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) { + auto it = find(getKey(communication, expr_evaluator)); + NVF_ERROR( + it != nullptr, + "No remote buffer found for ", + communication->toString()); + return *it; } - void insert(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator, std::unique_ptr handle) { - handles_[getKey(comm, expr_evaluator)] = std::move(handle); - } + void exchangeHandles(const std::vector& communications, const ExpressionEvaluator& expr_evaluator); private: using KeyType = std::tuple; - KeyType getKey(P2PCommunication* comm, ExpressionEvaluator& expr_evaluator) { + KeyType getKey(P2PCommunication* comm, const 
ExpressionEvaluator& expr_evaluator) { int64_t dst = expr_evaluator.evaluate(comm->dst()).as(); int64_t src = expr_evaluator.evaluate(comm->src()).as(); at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as(); return std::make_tuple(dst, src, buffer, comm); } + void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr handle) { + handles_[getKey(comm, expr_evaluator)] = std::move(handle); + } - void insert(KeyType key, std::unique_ptr handle) { - handles_[key] = std::move(handle); + P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) { + return find(getKey(comm, expr_evaluator)); } P2pIpcHandle* find(KeyType key) { @@ -124,8 +113,6 @@ class IpcHandleCache { } }; - - struct KeyHash { std::size_t operator()(const KeyType& key) const { return (std::hash()(std::get<0>(key)) << 13) ^ diff --git a/tests/cpp/test_multidevice_gpu_comms.cpp b/tests/cpp/test_multidevice_gpu_comms.cpp index 75a6aeba472..c9159d3f20c 100644 --- a/tests/cpp/test_multidevice_gpu_comms.cpp +++ b/tests/cpp/test_multidevice_gpu_comms.cpp @@ -20,6 +20,18 @@ namespace nvfuser { #define CUDA_CALL(call) ASSERT_EQ((call), cudaSuccess) +template +std::vector toBytes(const T& data) { + return std::vector( + reinterpret_cast(&data), + reinterpret_cast(&data) + sizeof(T)); +} + +template +const T& fromBytes(const std::vector& bytes) { + return *reinterpret_cast(bytes.data()); +} + class GpuCommTest : public MultiDeviceTest {}; TEST_F(GpuCommTest, IpcMemHandle) { From 263d95c046714f32f72075d090f49dff8d641005 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 14:54:40 -0800 Subject: [PATCH 51/55] minor cleanup --- csrc/multidevice/ipc_handle.cpp | 1 - csrc/multidevice/ipc_handle.h | 29 ++++++++++++----------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp index 9e3c893f223..3cb36f1963d 100644 --- a/csrc/multidevice/ipc_handle.cpp 
+++ b/csrc/multidevice/ipc_handle.cpp @@ -76,7 +76,6 @@ IpcHandle::~IpcHandle() { } } - void IpcHandleCache::exchangeHandles(const std::vector& communications, const ExpressionEvaluator& expr_evaluator) { Communicator* communicator = &Communicator::getInstance(); const int64_t my_rank = communicator->deviceId(); diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h index fb8c025261d..f81d74550ae 100644 --- a/csrc/multidevice/ipc_handle.h +++ b/csrc/multidevice/ipc_handle.h @@ -56,13 +56,17 @@ class P2pIpcHandle { std::unique_ptr peer_; }; +// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*) +// we need a counter on Tensor+P2PCommunication* for each given dst, src +// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID) +// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle. 
class IpcHandleCache { public: IpcHandleCache() = default; ~IpcHandleCache() = default; - const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) { - auto it = find(getKey(communication, expr_evaluator)); + const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) const { + auto it = find(communication, expr_evaluator); NVF_ERROR( it != nullptr, "No remote buffer found for ", @@ -75,22 +79,19 @@ class IpcHandleCache { private: using KeyType = std::tuple; - KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) { + KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const { int64_t dst = expr_evaluator.evaluate(comm->dst()).as(); int64_t src = expr_evaluator.evaluate(comm->src()).as(); at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as(); return std::make_tuple(dst, src, buffer, comm); } - void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr handle) { - handles_[getKey(comm, expr_evaluator)] = std::move(handle); - } - P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) { - return find(getKey(comm, expr_evaluator)); + void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr handle) { + handles_[getKey(comm, expr_evaluator)] = std::move(handle); } - P2pIpcHandle* find(KeyType key) { - auto it = handles_.find(key); + P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const { + auto it = handles_.find(getKey(comm, expr_evaluator)); if (it == handles_.end()) { return nullptr; } @@ -102,7 +103,7 @@ class IpcHandleCache { auto ptr = reinterpret_cast(tensor.data_ptr()); auto offset = tensor.storage_offset(); auto element_size = tensor.element_size(); - return std::hash()(ptr) ^ std::hash()(offset) ^ + return std::hash()(ptr) ^ std::hash()(offset) << 32 ^ std::hash()(element_size); } }; 
@@ -139,10 +140,4 @@ class IpcHandleCache { handles_; }; - -// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*) -// we need a counter on Tensor+P2PCommunication* for each given dst, src -// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID) -// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle. - } // nvfuser From 106d29579e0466062a6b2c4c47ecce4249c61406 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 14:55:22 -0800 Subject: [PATCH 52/55] lint --- csrc/host_ir/executor.cpp | 9 ++-- csrc/multidevice/ipc_handle.cpp | 73 +++++++++++++++++--------------- csrc/multidevice/ipc_handle.h | 75 +++++++++++++++++++-------------- 3 files changed, 89 insertions(+), 68 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4d0bbc34776..f8ef8572dfe 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -410,7 +410,8 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } void HostIrEvaluator::handle(ShareMemHandles* share_mem_handles) { - ipc_handle_cache_.exchangeHandles(share_mem_handles->communications(), expr_evaluator_); + ipc_handle_cache_.exchangeHandles( + share_mem_handles->communications(), expr_evaluator_); } void HostIrEvaluator::handle(Communication* communication) { @@ -469,7 +470,8 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { return; } - const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(communication, expr_evaluator_); + const P2pIpcHandle& ipc_handles = + ipc_handle_cache_.get(communication, expr_evaluator_); const IpcHandle& peer_buffer = ipc_handles.peer(); const auto local_semaphore = reinterpret_cast(ipc_handles.local().semaphore()); @@ -549,7 +551,8 @@ void 
HostIrEvaluator::handle(Wait* wait) { at::Tensor buffer = getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); - const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm, expr_evaluator_); + const P2pIpcHandle& ipc_handles = + ipc_handle_cache_.get(p2p_comm, expr_evaluator_); const auto local_semaphore = reinterpret_cast(ipc_handles.local().semaphore()); diff --git a/csrc/multidevice/ipc_handle.cpp b/csrc/multidevice/ipc_handle.cpp index 3cb36f1963d..089071f22d7 100644 --- a/csrc/multidevice/ipc_handle.cpp +++ b/csrc/multidevice/ipc_handle.cpp @@ -5,9 +5,9 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on -#include -#include -#include +#include +#include +#include namespace nvfuser { @@ -27,7 +27,6 @@ const T& fromBytes(const std::vector& bytes) { } // namespace - IpcHandle::IpcHandle(at::Tensor tensor) : ptr_(tensor.data_ptr()), storage_offset_(tensor.storage_offset()), @@ -35,15 +34,13 @@ IpcHandle::IpcHandle(at::Tensor tensor) rank_(Communicator::getInstance().deviceId()) { NVFUSER_CUDA_RT_SAFE_CALL( cudaIpcGetMemHandle(&ipc_handle_, tensor.data_ptr())); - NVFUSER_CUDA_RT_SAFE_CALL(cudaMalloc( - (void**)&semaphore_, sizeof(IpcSemaphore))); + NVFUSER_CUDA_RT_SAFE_CALL( + cudaMalloc((void**)&semaphore_, sizeof(IpcSemaphore))); static_assert( sizeof(IpcSemaphore) == sizeof(int), "IpcSemaphore must be same size as int"); NVFUSER_CUDA_RT_SAFE_CALL(cudaMemset( - (void*)semaphore_, - (int)IpcSemaphore::kReady, - sizeof(IpcSemaphore))); + (void*)semaphore_, (int)IpcSemaphore::kReady, sizeof(IpcSemaphore))); NVFUSER_CUDA_RT_SAFE_CALL( cudaIpcGetMemHandle(&semaphore_ipc_handle_, semaphore_)); } @@ -76,12 +73,14 @@ IpcHandle::~IpcHandle() { } } -void IpcHandleCache::exchangeHandles(const std::vector& communications, const ExpressionEvaluator& expr_evaluator) { +void IpcHandleCache::exchangeHandles( + const std::vector& communications, + const ExpressionEvaluator& expr_evaluator) { Communicator* communicator = &Communicator::getInstance(); 
const int64_t my_rank = communicator->deviceId(); - auto get_tensor = [&expr_evaluator](P2PCommunication* communication) -> at::Tensor { - return expr_evaluator.evaluate(communication->buffer()) - .as(); + auto get_tensor = + [&expr_evaluator](P2PCommunication* communication) -> at::Tensor { + return expr_evaluator.evaluate(communication->buffer()).as(); }; std::vector non_cached_communications; @@ -92,7 +91,12 @@ void IpcHandleCache::exchangeHandles(const std::vector& commu expr_evaluator.evaluate(communication->src()).as(); const bool is_sender = my_rank == src; const bool is_receiver = my_rank == dst; - NVF_ERROR(is_sender || is_receiver, "RANK ", my_rank, " is not involved in the p2p comm ", communication); + NVF_ERROR( + is_sender || is_receiver, + "RANK ", + my_rank, + " is not involved in the p2p comm ", + communication); if (is_sender && is_receiver) { continue; } @@ -103,23 +107,24 @@ void IpcHandleCache::exchangeHandles(const std::vector& commu } // put memhandles to TCP store - auto get_tcp_store_key = - [&expr_evaluator](P2PCommunication* communication, int64_t rank) -> std::string { + auto get_tcp_store_key = [&expr_evaluator]( + P2PCommunication* communication, + int64_t rank) -> std::string { return "nvfuser_ipc_handle_info_P2PComm_dst=" + - std::to_string(expr_evaluator.evaluate(communication->dst()) - .as()) + + std::to_string( + expr_evaluator.evaluate(communication->dst()).as()) + "_src=" + - std::to_string(expr_evaluator.evaluate(communication->src()) - .as()) + + std::to_string( + expr_evaluator.evaluate(communication->src()).as()) + "_rank=" + std::to_string(rank); }; std::unordered_map> local_ipc_handles; auto store = communicator->getTcpStore(); for (P2PCommunication* communication : non_cached_communications) { - auto buffer_handle = - std::make_unique(get_tensor(communication)); - store->set(get_tcp_store_key(communication, my_rank), toBytes(*buffer_handle)); + auto buffer_handle = std::make_unique(get_tensor(communication)); + store->set( 
+ get_tcp_store_key(communication, my_rank), toBytes(*buffer_handle)); local_ipc_handles.emplace(communication, std::move(buffer_handle)); } @@ -130,27 +135,27 @@ void IpcHandleCache::exchangeHandles(const std::vector& commu // get memhandles from TCP store for (P2PCommunication* communication : non_cached_communications) { const auto dst = - expr_evaluator.evaluate(communication->dst()).as(); + expr_evaluator.evaluate(communication->dst()).as(); const auto src = - expr_evaluator.evaluate(communication->src()).as(); + expr_evaluator.evaluate(communication->src()).as(); int64_t peer = (my_rank == dst) ? src : dst; auto& local_ipc_handle = local_ipc_handles.at(communication); std::string key = get_tcp_store_key(communication, peer); NVF_ERROR( - store->check({key}), - "key ", - key, - " not found in store at rank ", - my_rank); + store->check({key}), + "key ", + key, + " not found in store at rank ", + my_rank); auto peer_ipc_handle = std::make_unique(store->get(key)); - auto ipc_handles = std::make_unique(std::move(local_ipc_handle), std::move(peer_ipc_handle)); + auto ipc_handles = std::make_unique( + std::move(local_ipc_handle), std::move(peer_ipc_handle)); - insert( - communication, expr_evaluator, std::move(ipc_handles)); + insert(communication, expr_evaluator, std::move(ipc_handles)); } } -} // nvfuser +} // namespace nvfuser diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h index f81d74550ae..cba4d50b83c 100644 --- a/csrc/multidevice/ipc_handle.h +++ b/csrc/multidevice/ipc_handle.h @@ -6,9 +6,9 @@ */ // clang-format on #pragma once +#include #include #include -#include namespace nvfuser { @@ -40,8 +40,10 @@ class IpcHandle { class P2pIpcHandle { public: - - P2pIpcHandle(std::unique_ptr local, std::unique_ptr peer) : local_(std::move(local)), peer_(std::move(peer)) {} + P2pIpcHandle( + std::unique_ptr local, + std::unique_ptr peer) + : local_(std::move(local)), peer_(std::move(peer)) {} const auto& local() const { return *local_; @@ 
-56,41 +58,56 @@ class P2pIpcHandle { std::unique_ptr peer_; }; -// The cache key must be match on (dst, src, tensor, Id of SendComm, Id of RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*) -// we need a counter on Tensor+P2PCommunication* for each given dst, src -// In the store, we need the key to be computed on (dst, src, counter), also bc it cannot depend nor on tensor neither on P2PCommunication* (not even its ID) -// We could store separately the local and remote handles, or by first mapping with the IpcHandle's rank. Btw, we need to add rank to IpcHandle. +// The cache key must match on (dst, src, tensor, Id of SendComm, Id of +// RecvComm) or (int64_t dst, int64_t src, tensor, P2PCommunication*). We need +// a counter on Tensor+P2PCommunication* for each given (dst, src). In the +// store, the key must be computed on (dst, src, counter), because it can +// depend neither on the tensor nor on P2PCommunication* (not even its ID). We +// could store the local and remote handles separately, or by first mapping +// with the IpcHandle's rank. Btw, we need to add rank to IpcHandle.
class IpcHandleCache { public: IpcHandleCache() = default; ~IpcHandleCache() = default; - const P2pIpcHandle& get(P2PCommunication* communication, ExpressionEvaluator& expr_evaluator) const { + const P2pIpcHandle& get( + P2PCommunication* communication, + ExpressionEvaluator& expr_evaluator) const { auto it = find(communication, expr_evaluator); NVF_ERROR( - it != nullptr, - "No remote buffer found for ", - communication->toString()); + it != nullptr, + "No remote buffer found for ", + communication->toString()); return *it; } - void exchangeHandles(const std::vector& communications, const ExpressionEvaluator& expr_evaluator); + void exchangeHandles( + const std::vector& communications, + const ExpressionEvaluator& expr_evaluator); private: using KeyType = std::tuple; - KeyType getKey(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const { + KeyType getKey( + P2PCommunication* comm, + const ExpressionEvaluator& expr_evaluator) const { int64_t dst = expr_evaluator.evaluate(comm->dst()).as(); int64_t src = expr_evaluator.evaluate(comm->src()).as(); - at::Tensor buffer = expr_evaluator.evaluate(comm->buffer()).as(); + at::Tensor buffer = + expr_evaluator.evaluate(comm->buffer()).as(); return std::make_tuple(dst, src, buffer, comm); } - void insert(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator, std::unique_ptr handle) { + void insert( + P2PCommunication* comm, + const ExpressionEvaluator& expr_evaluator, + std::unique_ptr handle) { handles_[getKey(comm, expr_evaluator)] = std::move(handle); } - P2pIpcHandle* find(P2PCommunication* comm, const ExpressionEvaluator& expr_evaluator) const { + P2pIpcHandle* find( + P2PCommunication* comm, + const ExpressionEvaluator& expr_evaluator) const { auto it = handles_.find(getKey(comm, expr_evaluator)); if (it == handles_.end()) { return nullptr; @@ -103,8 +120,8 @@ class IpcHandleCache { auto ptr = reinterpret_cast(tensor.data_ptr()); auto offset = tensor.storage_offset(); auto element_size = 
tensor.element_size(); - return std::hash()(ptr) ^ std::hash()(offset) << 32 ^ - std::hash()(element_size); + return std::hash()(ptr) ^ + std::hash()(offset) << 32 ^ std::hash()(element_size); } }; @@ -117,27 +134,23 @@ class IpcHandleCache { struct KeyHash { std::size_t operator()(const KeyType& key) const { return (std::hash()(std::get<0>(key)) << 13) ^ - (std::hash()(std::get<1>(key)) << 7) ^ - (TensorHash{}(std::get<2>(key))) ^ - (std::hash()(std::get<3>(key))); + (std::hash()(std::get<1>(key)) << 7) ^ + (TensorHash{}(std::get<2>(key))) ^ + (std::hash()(std::get<3>(key))); } }; struct KeyEqual { bool operator()(const KeyType& lhs, const KeyType& rhs) const { return std::get<0>(lhs) == std::get<0>(rhs) && - std::get<1>(lhs) == std::get<1>(rhs) && - TensorEqual{}(std::get<2>(lhs), std::get<2>(rhs)) && - std::get<3>(lhs) == std::get<3>(rhs); + std::get<1>(lhs) == std::get<1>(rhs) && + TensorEqual{}(std::get<2>(lhs), std::get<2>(rhs)) && + std::get<3>(lhs) == std::get<3>(rhs); } }; - std::unordered_map< - KeyType, - std::unique_ptr, - KeyHash, - KeyEqual> - handles_; + std::unordered_map, KeyHash, KeyEqual> + handles_; }; -} // nvfuser +} // namespace nvfuser From ed69f75d4c4fcd92f44c06f23cb5d8c77c892aef Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 14:58:38 -0800 Subject: [PATCH 53/55] minor --- csrc/multidevice/ipc_handle.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h index cba4d50b83c..5df55fec441 100644 --- a/csrc/multidevice/ipc_handle.h +++ b/csrc/multidevice/ipc_handle.h @@ -121,7 +121,7 @@ class IpcHandleCache { auto offset = tensor.storage_offset(); auto element_size = tensor.element_size(); return std::hash()(ptr) ^ - std::hash()(offset) << 32 ^ std::hash()(element_size); + std::hash()(offset << 8) ^ std::hash()(element_size); } }; @@ -133,8 +133,8 @@ class IpcHandleCache { struct KeyHash { std::size_t operator()(const KeyType& key) const { - return 
(std::hash()(std::get<0>(key)) << 13) ^ - (std::hash()(std::get<1>(key)) << 7) ^ + return (std::hash()(std::get<0>(key) << 13)) ^ + (std::hash()(std::get<1>(key) << 7)) ^ (TensorHash{}(std::get<2>(key))) ^ (std::hash()(std::get<3>(key))); } From 929ae0df55efbda3bb342229f56f94b61a6138a8 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 15:13:49 -0800 Subject: [PATCH 54/55] minor --- csrc/host_ir/host_ir.cpp | 6 +++++- csrc/host_ir/host_ir.h | 2 +- csrc/multidevice/communication.cpp | 8 ++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index edc9c476eaf..b5cacfd71b8 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -339,7 +339,11 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(ShareMemHandles) std::string ShareMemHandles::toString(int indent_size) const { std::stringstream ss; - indent(ss, indent_size) << "ShareMemHandles" << std::endl; + indent(ss, indent_size) << "ShareMemHandles("; + for (auto communication: communications()) { + ss << communication->toInlineString() << ", "; + } + ss << std::endl; return ss.str(); } diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index efb23b95d67..d7fa4512db8 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -335,7 +335,7 @@ class ShareMemHandles : public Expr { return "hir::ShareMemHandles"; } - const std::vector& communications() { + const std::vector& communications() const { return attribute>(0); } }; diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index e48290241b0..1c1ada7f32c 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -230,17 +230,17 @@ P2PCommunication::P2PCommunication( NVFUSER_DEFINE_CLONE_AND_CREATE(P2PCommunication) -std::string P2PCommunication::toString(const int indent_size) const { +std::string P2PCommunication::toInlineString(const int indent_size) const { std::stringstream ss; indent(ss, indent_size) 
<< "P2PCommunication " << name() << " (" << "buffer=" << buffer() << ", " << "dst=" << dst() << ", " - << "src=" << src() << ")\n"; + << "src=" << src() << ")"; return ss.str(); } -std::string P2PCommunication::toInlineString(int indent_size) const { - return toString(indent_size); +std::string P2PCommunication::toString(int indent_size) const { + return toInlineString(indent_size) + "\n"; } namespace { From 359779d2778a1d4ed337499f669fb33123c3b9bb Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 12 Feb 2025 15:44:09 -0800 Subject: [PATCH 55/55] move p2p runtime in separate file --- CMakeLists.txt | 1 + csrc/host_ir/executor.cpp | 70 +++---------------- csrc/multidevice/cuda_p2p.cpp | 70 +++++++++++++++++++ csrc/multidevice/cuda_p2p.h | 22 ++++++ csrc/multidevice/ipc_handle.h | 1 + tests/cpp/test_multidevice_communications.cpp | 14 ++-- 6 files changed, 109 insertions(+), 69 deletions(-) create mode 100644 csrc/multidevice/cuda_p2p.cpp create mode 100644 csrc/multidevice/cuda_p2p.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e2afe3cb6e..6b16cb0c075 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/mma_type.cpp ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp + ${NVFUSER_SRCS_DIR}/multidevice/cuda_p2p.cpp ${NVFUSER_SRCS_DIR}/multidevice/ipc_handle.cpp ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index f8ef8572dfe..7bd38e90c15 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -472,58 +473,12 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(communication, expr_evaluator_); - const IpcHandle& peer_buffer = ipc_handles.peer(); - const auto local_semaphore = 
- reinterpret_cast(ipc_handles.local().semaphore()); - const auto remote_semaphore = - reinterpret_cast(ipc_handles.peer().semaphore()); - static_assert( - sizeof(IpcSemaphore) == sizeof(uint32_t), "IpcSemaphore must be 32 bits"); - - const auto current_stream = reinterpret_cast( - c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); - + const auto current_stream = static_cast( + c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); if (is_receiver) { - // wait for sender to be ready - NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( - current_stream, - local_semaphore, - (cuuint32_t)(IpcSemaphore::kInUse), - CU_STREAM_WAIT_VALUE_EQ)); - // RDMA get the data from the sender - NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( - buffer.data_ptr(), - peer_buffer.ptr(), - buffer.numel() * buffer.element_size(), - cudaMemcpyDeviceToDevice, - current_stream)); - // Signals completion to self - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( - current_stream, - local_semaphore, - (cuuint32_t)(IpcSemaphore::kReady), - CU_STREAM_WRITE_VALUE_DEFAULT)); - // Signals completion to receiver - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( - current_stream, - remote_semaphore, - (cuuint32_t)(IpcSemaphore::kReady), - CU_STREAM_WRITE_VALUE_DEFAULT)); + getZcopy::RecvPost(ipc_handles, buffer.numel() * buffer.element_size(), current_stream); } else /*sender*/ { - // signal to self that transfer is in progress - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( - current_stream, - local_semaphore, - (cuuint32_t)(IpcSemaphore::kInUse), - CU_STREAM_WRITE_VALUE_DEFAULT)); - // signal to receiver that the buffer is ready - NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( - current_stream, - remote_semaphore, - (cuuint32_t)(IpcSemaphore::kInUse), - CU_STREAM_WRITE_VALUE_DEFAULT)); // passing - // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - // gives an error + getZcopy::SendPost(ipc_handles, current_stream); } } @@ -544,23 +499,14 @@ void HostIrEvaluator::handle(Wait* wait) { } const auto 
src = expr_evaluator_.evaluate(p2p_comm->src()).as(); + const auto dst = expr_evaluator_.evaluate(p2p_comm->dst()).as(); const int64_t my_rank = communicator_->deviceId(); - if (my_rank == src) { + if (my_rank == src && src != dst) { const auto current_stream = static_cast( c10::cuda::getCurrentCUDAStream(my_local_device_index_).stream()); - at::Tensor buffer = - getKnownTensorOrUndefined(p2p_comm->buffer(), expr_evaluator_); - const P2pIpcHandle& ipc_handles = ipc_handle_cache_.get(p2p_comm, expr_evaluator_); - const auto local_semaphore = - reinterpret_cast(ipc_handles.local().semaphore()); - - NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( - current_stream, - local_semaphore, - (cuuint32_t)(IpcSemaphore::kReady), - CU_STREAM_WAIT_VALUE_EQ)); + getZcopy::SendWait(ipc_handles, current_stream); } } diff --git a/csrc/multidevice/cuda_p2p.cpp b/csrc/multidevice/cuda_p2p.cpp new file mode 100644 index 00000000000..d4aa148cb4a --- /dev/null +++ b/csrc/multidevice/cuda_p2p.cpp @@ -0,0 +1,70 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include + +namespace nvfuser { + +namespace getZcopy { + +void RecvPost(const P2pIpcHandle& ipc_handles, int64_t count, CUstream stream) { + // wait for sender to be ready + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( + stream, + reinterpret_cast(ipc_handles.local().semaphore()), + (cuuint32_t)(IpcSemaphore::kInUse), + CU_STREAM_WAIT_VALUE_EQ)); + // RDMA get the data from the sender + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpyAsync( + ipc_handles.local().ptr(), + ipc_handles.peer().ptr(), + count, + cudaMemcpyDeviceToDevice, + stream)); + // Signals completion to self + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + stream, + reinterpret_cast(ipc_handles.local().semaphore()), + (cuuint32_t)(IpcSemaphore::kReady), + CU_STREAM_WRITE_VALUE_DEFAULT)); + // Signals completion to sender + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + stream, + reinterpret_cast(ipc_handles.peer().semaphore()), + (cuuint32_t)(IpcSemaphore::kReady), + CU_STREAM_WRITE_VALUE_DEFAULT)); +} + +void SendPost(const P2pIpcHandle& ipc_handles, CUstream stream) { + // signal to self that transfer is in progress + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + stream, + reinterpret_cast(ipc_handles.local().semaphore()), + (cuuint32_t)(IpcSemaphore::kInUse), + CU_STREAM_WRITE_VALUE_DEFAULT)); + // signal to receiver that the buffer is ready + NVFUSER_CUDA_SAFE_CALL(cuStreamWriteValue32( + stream, + reinterpret_cast(ipc_handles.peer().semaphore()), + (cuuint32_t)(IpcSemaphore::kInUse), + CU_STREAM_WRITE_VALUE_DEFAULT)); // passing + // CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + // gives an error +} + +void SendWait(const P2pIpcHandle& ipc_handles, CUstream stream) { + NVFUSER_CUDA_SAFE_CALL(cuStreamWaitValue32( + stream, + reinterpret_cast(ipc_handles.local().semaphore()), + (cuuint32_t)(IpcSemaphore::kReady), + CU_STREAM_WAIT_VALUE_EQ)); +} + +} // namespace getZcopy + +} // namespace nvfuser diff --git
a/csrc/multidevice/cuda_p2p.h b/csrc/multidevice/cuda_p2p.h new file mode 100644 index 00000000000..45d2fdd2558 --- /dev/null +++ b/csrc/multidevice/cuda_p2p.h @@ -0,0 +1,22 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once +#include +#include + +namespace nvfuser { + +namespace getZcopy { + +void RecvPost(const P2pIpcHandle& ipc_handles, int64_t count, CUstream stream); +void SendPost(const P2pIpcHandle& ipc_handles, CUstream stream); +void SendWait(const P2pIpcHandle& ipc_handles, CUstream stream); + +} // namespace getZcopy + +} // namespace nvfuser diff --git a/csrc/multidevice/ipc_handle.h b/csrc/multidevice/ipc_handle.h index 5df55fec441..70e5a3bf560 100644 --- a/csrc/multidevice/ipc_handle.h +++ b/csrc/multidevice/ipc_handle.h @@ -151,6 +151,7 @@ class IpcHandleCache { std::unordered_map, KeyHash, KeyEqual> handles_; + // TODO: add counter to support multiple send/recv per pair of ranks }; } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_communications.cpp b/tests/cpp/test_multidevice_communications.cpp index 9db4f3a78eb..d0a3da5a26f 100644 --- a/tests/cpp/test_multidevice_communications.cpp +++ b/tests/cpp/test_multidevice_communications.cpp @@ -432,29 +432,29 @@ TEST_F(P2PCommunicationTest, CudaComm) { FusionGuard fg(container.get()); auto* my_rank_val = IrBuilder::create(my_rank, DataType::Int); - auto* recv_peer_val = IrBuilder::create(recv_peer, DataType::Int); auto* send_peer_val = IrBuilder::create(send_peer, DataType::Int); + auto* recv_peer_val = IrBuilder::create(recv_peer, DataType::Int); auto* send_tv = makeContigTensor(1); auto* recv_tv = makeContigTensor(1); container->addInput(send_tv); container->addInput(recv_tv); - auto recv = IrBuilder::create( - recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda); auto send = IrBuilder::create( - send_tv, 
send_peer_val, my_rank_val, CommunicatorBackend::kCuda); - std::vector grouped_communications = {recv, send}; + send_tv, send_peer_val, my_rank_val, CommunicatorBackend::kCuda); + auto recv = IrBuilder::create( + recv_tv, my_rank_val, recv_peer_val, CommunicatorBackend::kCuda); + std::vector grouped_communications = {send, recv}; auto share_mem_handles = IrBuilder::create( std::move(grouped_communications)); - auto wait_recv = IrBuilder::create(recv); auto wait_send = IrBuilder::create(send); + auto wait_recv = IrBuilder::create(recv); container->pushBackTopLevelExprs(share_mem_handles); container->pushBackTopLevelExprs(send); container->pushBackTopLevelExprs(recv); - container->pushBackTopLevelExprs(wait_recv); container->pushBackTopLevelExprs(wait_send); + container->pushBackTopLevelExprs(wait_recv); hir::HostIrEvaluator executor(std::move(container), communicator_);