Implement the HIP cumulative access fix: set access over the whole mapped range from the base VA

artulab · artulab · commit c3ff51f06dd9 · 2026-05-21T12:38:11.000-07:00
diff --git a/iris/drivers/base.py b/iris/drivers/base.py
@@ -59,15 +59,31 @@ def initialize(self, device_ordinal: int) -> None:
         """Prepare the driver for a specific local GPU."""
 
     @abstractmethod
-    def allocate_exportable(self, size: int, va: Optional[int] = None) -> LocalAllocation:
+    def allocate_exportable(
+        self,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
+    ) -> LocalAllocation:
         """Allocate exportable memory, optionally mapping it at a caller-reserved VA."""
 
     @abstractmethod
     def export_handle(self, allocation: LocalAllocation) -> bytes:
         """Export a transport-specific handle for a local allocation."""
 
     @abstractmethod
-    def import_and_map(self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None) -> PeerMapping:
+    def import_and_map(
+        self,
+        peer_rank: int,
+        handle_bytes: bytes,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
+    ) -> PeerMapping:
         """Import a peer handle and map it into the local virtual address space."""
 
     @abstractmethod
diff --git a/iris/drivers/fabric/amd.py b/iris/drivers/fabric/amd.py
@@ -28,15 +28,27 @@ def initialize(self, device_ordinal: int) -> None:
         raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
 
     def allocate_exportable(
-        self, size: int, va: Optional[int] = None
+        self,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> LocalAllocation:
         raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
 
     def export_handle(self, allocation: LocalAllocation) -> bytes:
         raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
 
     def import_and_map(
-        self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
+        self,
+        peer_rank: int,
+        handle_bytes: bytes,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> PeerMapping:
         raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
 
diff --git a/iris/drivers/fabric/nvidia.py b/iris/drivers/fabric/nvidia.py
@@ -356,9 +356,16 @@ def _check_initialized(self) -> None:
             )
 
     def allocate_exportable(
-        self, size: int, va: Optional[int] = None
+        self,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> LocalAllocation:
         self._check_initialized()
+        if (access_va is None) != (access_size is None):
+            raise CudaFabricError("access_va and access_size must be provided together")
         props = self._make_alloc_props()
         granularity = self._get_granularity()
         alloc_size = _round_up(size, granularity)
@@ -389,7 +396,10 @@ def allocate_exportable(
                 "cuMemMap",
             )
             mapped = True
-            self._mem_set_access(mapped_va, alloc_size)
+            self._mem_set_access(
+                int(access_va) if access_va is not None else mapped_va,
+                int(access_size) if access_size is not None else alloc_size,
+            )
             return LocalAllocation(
                 va=mapped_va,
                 size=alloc_size,
@@ -448,9 +458,18 @@ def _import_handle(self, handle_bytes: bytes) -> int:
         return int(imported.value)
 
     def import_and_map(
-        self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
+        self,
+        peer_rank: int,
+        handle_bytes: bytes,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> PeerMapping:
         self._check_initialized()
+        if (access_va is None) != (access_size is None):
+            raise CudaFabricError("access_va and access_size must be provided together")
         imported_handle = self._import_handle(handle_bytes)
 
         granularity = self._get_granularity()
@@ -472,7 +491,10 @@ def import_and_map(
                 "cuMemMap",
             )
             mapped = True
-            self._mem_set_access(mapped_va, size)
+            self._mem_set_access(
+                int(access_va) if access_va is not None else mapped_va,
+                int(access_size) if access_size is not None else size,
+            )
         except Exception:
             if mapped:
                 try:
diff --git a/iris/drivers/local/amd.py b/iris/drivers/local/amd.py
@@ -378,7 +378,12 @@ def initialize(self, device_ordinal: int) -> None:
         logger.info("LocalHipDriver initialized (device %d)", device_ordinal)
 
     def allocate_exportable(
-        self, size: int, va: Optional[int] = None
+        self,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> LocalAllocation:
         """
         Allocate HIP VMem exportable as a DMA-BUF.
@@ -387,6 +392,8 @@ def allocate_exportable(
         granularity-aligned VA range containing [va, va + size).
         """
         self._check_initialized()
+        if (access_va is None) != (access_size is None):
+            raise LocalHipError("access_va and access_size must be provided together")
         props = self._make_alloc_props()
         granularity = self._get_granularity()
         alloc_size = _round_up(size, granularity)
@@ -418,7 +425,10 @@ def allocate_exportable(
                 "hipMemMap",
             )
             mapped = True
-            self._mem_set_access(mapped_va, alloc_size)
+            self._mem_set_access(
+                int(access_va) if access_va is not None else mapped_va,
+                int(access_size) if access_size is not None else alloc_size,
+            )
             return LocalAllocation(
                 va=mapped_va,
                 size=alloc_size,
@@ -509,10 +519,19 @@ def export_handle(self, allocation: LocalAllocation) -> bytes:
         return self._export_range(allocation.va, allocation.size)
 
     def import_and_map(
-        self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
+        self,
+        peer_rank: int,
+        handle_bytes: bytes,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> PeerMapping:
         """Import a DMA-BUF descriptor and map it into local GPU address space."""
         self._check_initialized()
+        if (access_va is None) != (access_size is None):
+            raise LocalHipError("access_va and access_size must be provided together")
         if len(handle_bytes) != _AMD_HANDLE_BYTES:
             raise LocalHipError(
                 f"AMD local handle must be {_AMD_HANDLE_BYTES} bytes, got {len(handle_bytes)}"
@@ -548,7 +567,10 @@ def import_and_map(
                     "hipMemMap",
                 )
                 mapped = True
-                self._mem_set_access(mapped_va, size)
+                self._mem_set_access(
+                    int(access_va) if access_va is not None else mapped_va,
+                    int(access_size) if access_size is not None else size,
+                )
                 return PeerMapping(
                     peer_rank=peer_rank,
                     transport=InterconnectLevel.INTRA_NODE,
diff --git a/iris/drivers/local/nvidia.py b/iris/drivers/local/nvidia.py
@@ -362,7 +362,12 @@ def initialize(self, device_ordinal: int) -> None:
         logger.info("LocalCudaDriver initialized (device %d)", device_ordinal)
 
     def allocate_exportable(
-        self, size: int, va: Optional[int] = None
+        self,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> LocalAllocation:
         """
         Allocate CUDA VMM memory exportable as a POSIX FD.
@@ -371,6 +376,8 @@ def allocate_exportable(
         granularity-aligned VA range containing [va, va + size).
         """
         self._check_initialized()
+        if (access_va is None) != (access_size is None):
+            raise LocalCudaError("access_va and access_size must be provided together")
         props = self._make_alloc_props()
         granularity = self._get_granularity()
         alloc_size = _round_up(size, granularity)
@@ -401,7 +408,10 @@ def allocate_exportable(
                 "cuMemMap",
             )
             mapped = True
-            self._mem_set_access(mapped_va, alloc_size)
+            self._mem_set_access(
+                int(access_va) if access_va is not None else mapped_va,
+                int(access_size) if access_size is not None else alloc_size,
+            )
             return LocalAllocation(
                 va=mapped_va,
                 size=alloc_size,
@@ -475,10 +485,19 @@ def _import_handle(self, handle_bytes: bytes) -> int:
         return int(imported.value)
 
     def import_and_map(
-        self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
+        self,
+        peer_rank: int,
+        handle_bytes: bytes,
+        size: int,
+        va: Optional[int] = None,
+        *,
+        access_va: Optional[int] = None,
+        access_size: Optional[int] = None,
     ) -> PeerMapping:
         """Import a POSIX-FD handle and map it into local CUDA VMM VA space."""
         self._check_initialized()
+        if (access_va is None) != (access_size is None):
+            raise LocalCudaError("access_va and access_size must be provided together")
         imported_handle = self._import_handle(handle_bytes)
 
         granularity = self._get_granularity()
@@ -500,7 +519,10 @@ def import_and_map(
                 "cuMemMap",
             )
             mapped = True
-            self._mem_set_access(mapped_va, size)
+            self._mem_set_access(
+                int(access_va) if access_va is not None else mapped_va,
+                int(access_size) if access_size is not None else size,
+            )
         except Exception:
             steps: list[tuple[str, Callable[[], None]]] = []
             if mapped:
diff --git a/iris/host/distributed/topology.py b/iris/host/distributed/topology.py
@@ -1122,20 +1122,10 @@ def __init__(self, iris_ctx=None):
             if num_gpus <= 0:
                 raise RuntimeError("TopologyDiscovery requires at least one GPU")
 
-            # Use LOCAL_RANK (set by torchrun/SLURM) for per-node GPU assignment.
-            # This is more robust than global_rank % num_gpus, which breaks when
-            # ranks aren't distributed in a way that aligns with device_count
-            # (e.g., 2 nodes with 8 GPUs each but only 4 ranks per node).
-            # The % num_gpus clamp handles isolation (LOCAL_RANK=3, device_count=1).
-            local_rank = int(os.environ.get("LOCAL_RANK", 0))
-            self.gpu_id = local_rank % num_gpus
-            # MUST set device BEFORE init_process_group — NCCL needs a CUDA
-            # device assigned to this process, otherwise all ranks fight over
-            # GPU 0 and init either fails or produces world_size=1.
-            torch.cuda.set_device(self.gpu_id)
             if dist.is_initialized():
                 self.rank = dist.get_rank()
                 self.world_size = dist.get_world_size()
+                self.gpu_id = torch.cuda.current_device()
             else:
                 raise RuntimeError("TopologyDiscovery requires an initialized distributed process group.")
 
diff --git a/iris/host/memory/allocators/vmem_chunked_allocator.py b/iris/host/memory/allocators/vmem_chunked_allocator.py
@@ -266,7 +266,15 @@ def _grow_chunk(self):
             )
 
         target_va = self.base_va + self.mapped_extent
-        allocation = self.driver.allocate_exportable(self.chunk_size, va=target_va)
+        alloc_kwargs = {}
+        if self.driver.__class__.__name__ == "LocalHipDriver":
+            alloc_kwargs = {
+                "access_va": self.base_va,
+                "access_size": self.mapped_extent + self.chunk_size,
+            }
+        allocation = self.driver.allocate_exportable(
+            self.chunk_size, va=target_va, **alloc_kwargs
+        )
         self.chunks.append(allocation)
         self._shared_regions.append(
             _SharedRegion(va=allocation.va, size=allocation.size, allocation=allocation)
@@ -506,8 +514,18 @@ def import_external_tensor(self, external_tensor: torch.Tensor) -> torch.Tensor:
 
             target_base_va = self.base_va + target_offset
             handle_bytes = self.driver.export_pointer_handle(alloc_base, alloc_size)
+            import_kwargs = {}
+            if self.driver.__class__.__name__ == "LocalHipDriver":
+                import_kwargs = {
+                    "access_va": self.base_va,
+                    "access_size": target_offset + aligned_alloc_size,
+                }
             mapping = self.driver.import_and_map(
-                self.cur_rank, handle_bytes, aligned_alloc_size, va=target_base_va
+                self.cur_rank,
+                handle_bytes,
+                aligned_alloc_size,
+                va=target_base_va,
+                **import_kwargs,
             )
             self._imported_heap_mappings.append(mapping)
             self._shared_regions.append(
diff --git a/iris/host/memory/symmetric_heap.py b/iris/host/memory/symmetric_heap.py
@@ -15,7 +15,7 @@
 import numpy as np
 import torch
 
-from iris.host.logging.logging import _log_rank, logger
+from iris.host.logging.logging import _log_rank
 from iris.host.memory.allocators import TorchAllocator, VMemAllocator, VMemChunkedAllocator
 from iris.drivers.base import PeerMapping
 from iris.host.distributed.fd_passing import setup_fd_infrastructure
@@ -142,7 +142,7 @@ def __init__(
             from iris.host.distributed.topology import TopologyDiscovery
 
             try:
-                topology = TopologyDiscovery.discover()
+                topology = TopologyDiscovery().discover()
             except Exception as exc:
                 logger.warning(
                     "TopologyDiscovery.discover() failed (%s); VMemChunkedAllocator will default to INTRA_NODE driver.",
@@ -555,11 +555,18 @@ def _refresh_peer_access_chunked(self, dist):
                     reconstructed_handle = _replace_fd_in_local_handle(
                         peer_handle_bytes, cloned_fd
                     )
+                    import_kwargs = {}
+                    if len(peer_handle_bytes) == _LOCAL_HIP_HANDLE_BYTES:
+                        import_kwargs = {
+                            "access_va": peer_va_base,
+                            "access_size": peer_offset + peer_size,
+                        }
                     mapping = self.allocator.driver.import_and_map(
                         peer,
                         reconstructed_handle,
                         peer_size,
                         va=peer_va_base + peer_offset,
+                        **import_kwargs,
                     )
                     self._peer_imported_mappings[peer].append(mapping)