rapidsai
diff --git a/‎cpp/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions b/‎cpp/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎cpp/benchmarks/streaming/bench_streaming_shuffle.cpp‎
Lines changed: 1 addition & 1 deletion b/‎cpp/benchmarks/streaming/bench_streaming_shuffle.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/include/rapidsmpf/allgather/allgather.hpp‎
Lines changed: 11 additions & 3 deletions b/‎cpp/include/rapidsmpf/allgather/allgather.hpp‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎cpp/include/rapidsmpf/buffer/packed_data.hpp‎
Lines changed: 10 additions & 0 deletions b/‎cpp/include/rapidsmpf/buffer/packed_data.hpp‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎cpp/include/rapidsmpf/streaming/chunks/packed_data.hpp‎
Lines changed: 29 additions & 0 deletions b/‎cpp/include/rapidsmpf/streaming/chunks/packed_data.hpp‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎cpp/include/rapidsmpf/streaming/chunks/partition.hpp‎
Lines changed: 53 additions & 0 deletions b/‎cpp/include/rapidsmpf/streaming/chunks/partition.hpp‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎cpp/include/rapidsmpf/streaming/coll/allgather.hpp‎
Lines changed: 115 additions & 0 deletions b/‎cpp/include/rapidsmpf/streaming/coll/allgather.hpp‎
Lines changed: 115 additions & 0 deletions
diff --git a/‎cpp/include/rapidsmpf/streaming/cudf/shuffler.hpp‎ renamed to ‎cpp/include/rapidsmpf/streaming/coll/shuffler.hpp‎ b/‎cpp/include/rapidsmpf/streaming/cudf/shuffler.hpp‎ renamed to ‎cpp/include/rapidsmpf/streaming/coll/shuffler.hpp‎
diff --git a/‎cpp/include/rapidsmpf/streaming/cudf/partition.hpp‎
Lines changed: 1 addition & 42 deletions b/‎cpp/include/rapidsmpf/streaming/cudf/partition.hpp‎
Lines changed: 1 addition & 42 deletions
@@ -168,11 +168,12 @@ add_library(
 if(RAPIDSMPF_HAVE_STREAMING)
   target_sources(
     rapidsmpf
-    PRIVATE src/streaming/core/context.cpp
+    PRIVATE src/streaming/coll/allgather.cpp
+            src/streaming/coll/shuffler.cpp
+            src/streaming/core/context.cpp
             src/streaming/core/leaf_node.cpp
             src/streaming/core/node.cpp
             src/streaming/cudf/partition.cpp
-            src/streaming/cudf/shuffler.cpp
             src/streaming/cudf/table_chunk.cpp
   )
 endif()
 
@@ -18,11 +18,11 @@
 #include <rapidsmpf/nvtx.hpp>
 #include <rapidsmpf/shuffler/shuffler.hpp>
 #include <rapidsmpf/statistics.hpp>
+#include <rapidsmpf/streaming/coll/shuffler.hpp>
 #include <rapidsmpf/streaming/core/channel.hpp>
 #include <rapidsmpf/streaming/core/context.hpp>
 #include <rapidsmpf/streaming/core/node.hpp>
 #include <rapidsmpf/streaming/cudf/partition.hpp>
-#include <rapidsmpf/streaming/cudf/shuffler.hpp>
 #include <rapidsmpf/streaming/cudf/table_chunk.hpp>
 #include <rapidsmpf/utils.hpp>
 
 
@@ -8,6 +8,7 @@
 #include <chrono>
 #include <condition_variable>
 #include <cstdint>
+#include <functional>
 #include <limits>
 #include <memory>
 #include <mutex>
@@ -373,9 +374,10 @@ class AllGather {
     /**
      * @brief Insert packed data into the allgather operation.
      *
+     * @param sequence_number Sequence number ordering this data among local insertions.
      * @param packed_data The data to contribute to the allgather.
      */
-    void insert(PackedData&& packed_data);
+    void insert(std::uint64_t sequence_number, PackedData&& packed_data);
 
     /**
      * @brief Mark that this rank has finished contributing data.
@@ -445,6 +447,9 @@ class AllGather {
      * @param br Buffer resource for memory allocation.
      * @param statistics Statistics collection instance (disabled by
      * default).
+     * @param finished_callback Optional callback run when the allgather is locally
+     * finished. The callback is guaranteed to be called by the progress thread exactly
+     * once when the allgather is locally ready for extraction.
      *
      * @note The caller promises that inserted buffers are stream-ordered with respect
      * to their own stream, and extracted buffers are likewise guaranteed to be stream-
@@ -455,7 +460,8 @@ class AllGather {
         std::shared_ptr<ProgressThread> progress_thread,
         OpID op_id,
         BufferResource* br,
-        std::shared_ptr<Statistics> statistics = Statistics::disabled()
+        std::shared_ptr<Statistics> statistics = Statistics::disabled(),
+        std::function<void(void)>&& finished_callback = nullptr
     );
 
     /// @brief Deleted copy constructor.
@@ -524,8 +530,10 @@ class AllGather {
         progress_thread_;  ///< Progress thread for async operations
     BufferResource* br_;  ///< Buffer resource for memory allocation
     std::shared_ptr<Statistics> statistics_;  ///< Statistics collection instance
+    std::function<void(void)> finished_callback_{
+        nullptr
+    };  ///< Optional callback to run when allgather is finished and ready for extraction.
     std::atomic<Rank> finish_counter_;  ///< Counter for finish markers received
-    std::atomic<std::uint64_t> sequence_number_;  ///< Sequence number for chunks
     std::atomic<std::uint32_t> nlocal_insertions_;  ///< Number of local data insertions
     OpID op_id_;  ///< Unique operation identifier
     std::atomic<bool> locally_finished_{false};  ///< Whether this rank has finished
 
@@ -8,6 +8,7 @@
 #include <memory>
 #include <vector>
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
 #include <rapidsmpf/buffer/buffer.hpp>
@@ -68,6 +69,15 @@ struct PackedData {
     [[nodiscard]] bool empty() const {
         return metadata->empty() && data->size == 0;
     }
+
+    /**
+     * @brief Get the CUDA stream of the underlying `data` buffer.
+     *
+     * @return The stream view reported by `data->stream()`; requires non-null `data`.
+     */
+    [[nodiscard]] rmm::cuda_stream_view stream() const {
+        return data->stream();
+    }
 };
 
 }  // namespace rapidsmpf
@@ -0,0 +1,29 @@
+/**
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <rapidsmpf/buffer/packed_data.hpp>
+
+namespace rapidsmpf::streaming {
+
+/**
+ * @brief A `PackedData` payload tagged with a sequence number.
+ */
+struct PackedDataChunk {
+    /**
+     * @brief Sequence number used to preserve chunk ordering (no default value).
+     */
+    std::uint64_t sequence_number;
+
+    /**
+     * @brief The packed data payload carried by this chunk.
+     */
+    PackedData data;
+};
+
+}  // namespace rapidsmpf::streaming
@@ -0,0 +1,53 @@
+/**
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+#include <rapidsmpf/buffer/packed_data.hpp>
+#include <rapidsmpf/shuffler/chunk.hpp>
+
+namespace rapidsmpf::streaming {
+
+/**
+ * @brief Chunk of packed partitions keyed by partition ID.
+ *
+ * A single unit of work in a streaming pipeline: a map from `shuffler::PartID` to
+ * the packed (serialized) data of that partition, plus a sequence number.
+ */
+struct PartitionMapChunk {
+    /**
+     * @brief Sequence number used to preserve chunk ordering (no default value).
+     */
+    std::uint64_t sequence_number;
+
+    /**
+     * @brief Packed data of each partition, keyed by its `shuffler::PartID`.
+     */
+    std::unordered_map<shuffler::PartID, PackedData> data;
+};
+
+/**
+ * @brief Chunk of packed partitions stored as a vector.
+ *
+ * A single unit of work in a streaming pipeline: a plain sequence of packed
+ * partitions with no partition IDs attached, plus a sequence number.
+ */
+struct PartitionVectorChunk {
+    /**
+     * @brief Sequence number used to preserve chunk ordering (no default value).
+     */
+    std::uint64_t sequence_number;
+
+    /**
+     * @brief Packed data of each partition, stored in a vector.
+     */
+    std::vector<PackedData> data;
+};
+
+}  // namespace rapidsmpf::streaming
@@ -0,0 +1,115 @@
+/**
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <rapidsmpf/allgather/allgather.hpp>
+#include <rapidsmpf/buffer/packed_data.hpp>
+#include <rapidsmpf/communicator/communicator.hpp>
+#include <rapidsmpf/streaming/chunks/packed_data.hpp>
+#include <rapidsmpf/streaming/core/channel.hpp>
+#include <rapidsmpf/streaming/core/context.hpp>
+
+#include <coro/event.hpp>
+#include <coro/task.hpp>
+
+namespace rapidsmpf::streaming {
+
+/**
+ * @brief Asynchronous (coroutine) interface to `allgather::AllGather`.
+ *
+ * Once the AllGather is created, many tasks may insert data into it. If multiple tasks
+ * insert data, the user is responsible for arranging that `insert_finished` is only
+ * called after every call to `insert` has completed. A single consumer task should
+ * extract the data.
+ */
+class AllGather {
+  public:
+    /// @copydoc rapidsmpf::allgather::AllGather::Ordered
+    using Ordered = rapidsmpf::allgather::AllGather::Ordered;
+    /**
+     * @brief Construct an asynchronous allgather.
+     *
+     * @param ctx The streaming context to use.
+     * @param op_id Unique identifier for the allgather.
+     */
+    AllGather(std::shared_ptr<Context> ctx, OpID op_id);
+
+    AllGather(AllGather const&) = delete;
+    AllGather& operator=(AllGather const&) = delete;
+    AllGather(AllGather&&) = delete;
+    AllGather& operator=(AllGather&&) = delete;
+
+    ~AllGather();
+
+    /**
+     * @brief Get the streaming context associated with this AllGather object.
+     *
+     * @return Shared pointer to the context.
+     */
+    [[nodiscard]] std::shared_ptr<Context> ctx() const noexcept;
+
+    /**
+     * @brief Insert a chunk into the allgather.
+     *
+     * @param chunk The chunk to insert, holding the data and its sequence number.
+     */
+    void insert(PackedDataChunk&& chunk);
+
+    /// @copydoc rapidsmpf::allgather::AllGather::insert_finished()
+    void insert_finished();
+
+    /**
+     * @brief Extract all gathered data.
+     *
+     * @param ordered Whether the extracted data should be ordered. If ordered, the
+     * returned data is ordered first by rank and then by the sequence number of the
+     * inserted chunks on that rank.
+     *
+     * @return Coroutine that completes when all data is available for extraction and
+     * returns the data.
+     */
+    coro::task<std::vector<PackedDataChunk>> extract_all(Ordered ordered = Ordered::YES);
+
+  private:
+    coro::event
+        event_{};  ///< Event tracking whether all data has arrived and can be extracted.
+    std::shared_ptr<Context> ctx_;  ///< Streaming context.
+    allgather::AllGather gatherer_;  ///< Underlying collective allgather.
+};
+
+namespace node {
+
+/**
+ * @brief Create an allgather node for a single allgather operation.
+ *
+ * This is a streaming version of `rapidsmpf::allgather::AllGather` that operates on
+ * packed data received through `Channel`s.
+ *
+ * @param ctx The streaming context to use.
+ * @param ch_in Input channel providing `PackedDataChunk`s to be gathered.
+ * @param ch_out Output channel where the gathered `PackedDataChunk`s are sent.
+ * @param op_id Unique identifier for the operation.
+ * @param ordered Whether the extracted data is sent to the output channel with sequence
+ * numbers corresponding to the global total order of input chunks. If `Ordered::YES`
+ * (the default), the sequence numbers of the extracted data are ordered first by rank and
+ * then by input sequence number. Otherwise, the sequence numbers of the extracted chunks
+ * have no relation to any input sequence order.
+ *
+ * @return A streaming node that completes when the allgather is finished and the output
+ * channel is drained.
+ */
+Node allgather(
+    std::shared_ptr<Context> ctx,
+    std::shared_ptr<Channel> ch_in,
+    std::shared_ptr<Channel> ch_out,
+    OpID op_id,
+    AllGather::Ordered ordered = AllGather::Ordered::YES
+);
+}  // namespace node
+}  // namespace rapidsmpf::streaming
@@ -5,58 +5,17 @@
 
 #pragma once
 
-#include <unordered_map>
+#include <cstdint>
 #include <vector>
 
 #include <cudf/partitioning.hpp>
-#include <cudf/table/table.hpp>
 
-#include <rapidsmpf/buffer/packed_data.hpp>
-#include <rapidsmpf/shuffler/shuffler.hpp>
-#include <rapidsmpf/statistics.hpp>
 #include <rapidsmpf/streaming/core/channel.hpp>
 #include <rapidsmpf/streaming/core/context.hpp>
 #include <rapidsmpf/streaming/core/node.hpp>
-#include <rapidsmpf/streaming/cudf/table_chunk.hpp>
 
 namespace rapidsmpf::streaming {
 
-/**
- * @brief Chunk of packed partitions identified by partition ID.
- *
- * Represents a single unit of work in a streaming pipeline where each partition
- * is associated with a `PartID` and contains packed (serialized) data.
- */
-struct PartitionMapChunk {
-    /**
-     * @brief Sequence number used to preserve chunk ordering.
-     */
-    std::uint64_t sequence_number;
-
-    /**
-     * @brief Packed data for each partition, keyed by partition ID.
-     */
-    std::unordered_map<shuffler::PartID, PackedData> data;
-};
-
-/**
- * @brief Chunk of packed partitions stored as a vector.
- *
- * Represents a single unit of work in a streaming pipeline where the partitions
- * are stored in a vector.
- */
-struct PartitionVectorChunk {
-    /**
-     * @brief Sequence number used to preserve chunk ordering.
-     */
-    std::uint64_t sequence_number;
-
-    /**
-     * @brief Packed data for each partition stored in a vector.
-     */
-    std::vector<PackedData> data;
-};
-
 namespace node {
 
 /**