Skip to content

Commit c3ff51f

Browse files
committed
Implement the HIP cumulative access fix
1 parent c33dc44 commit c3ff51f

8 files changed

Lines changed: 140 additions & 31 deletions

File tree

iris/drivers/base.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,31 @@ def initialize(self, device_ordinal: int) -> None:
5959
"""Prepare the driver for a specific local GPU."""
6060

6161
@abstractmethod
62-
def allocate_exportable(self, size: int, va: Optional[int] = None) -> LocalAllocation:
62+
def allocate_exportable(
63+
self,
64+
size: int,
65+
va: Optional[int] = None,
66+
*,
67+
access_va: Optional[int] = None,
68+
access_size: Optional[int] = None,
69+
) -> LocalAllocation:
6370
"""Allocate exportable memory, optionally mapping it at a caller-reserved VA."""
6471

6572
@abstractmethod
6673
def export_handle(self, allocation: LocalAllocation) -> bytes:
6774
"""Export a transport-specific handle for a local allocation."""
6875

6976
@abstractmethod
70-
def import_and_map(self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None) -> PeerMapping:
77+
def import_and_map(
78+
self,
79+
peer_rank: int,
80+
handle_bytes: bytes,
81+
size: int,
82+
va: Optional[int] = None,
83+
*,
84+
access_va: Optional[int] = None,
85+
access_size: Optional[int] = None,
86+
) -> PeerMapping:
7187
"""Import a peer handle and map it into the local virtual address space."""
7288

7389
@abstractmethod

iris/drivers/fabric/amd.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,27 @@ def initialize(self, device_ordinal: int) -> None:
2828
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
2929

3030
def allocate_exportable(
31-
self, size: int, va: Optional[int] = None
31+
self,
32+
size: int,
33+
va: Optional[int] = None,
34+
*,
35+
access_va: Optional[int] = None,
36+
access_size: Optional[int] = None,
3237
) -> LocalAllocation:
3338
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
3439

3540
def export_handle(self, allocation: LocalAllocation) -> bytes:
3641
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
3742

3843
def import_and_map(
39-
self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
44+
self,
45+
peer_rank: int,
46+
handle_bytes: bytes,
47+
size: int,
48+
va: Optional[int] = None,
49+
*,
50+
access_va: Optional[int] = None,
51+
access_size: Optional[int] = None,
4052
) -> PeerMapping:
4153
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
4254

iris/drivers/fabric/nvidia.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -356,9 +356,16 @@ def _check_initialized(self) -> None:
356356
)
357357

358358
def allocate_exportable(
359-
self, size: int, va: Optional[int] = None
359+
self,
360+
size: int,
361+
va: Optional[int] = None,
362+
*,
363+
access_va: Optional[int] = None,
364+
access_size: Optional[int] = None,
360365
) -> LocalAllocation:
361366
self._check_initialized()
367+
if (access_va is None) != (access_size is None):
368+
raise CudaFabricError("access_va and access_size must be provided together")
362369
props = self._make_alloc_props()
363370
granularity = self._get_granularity()
364371
alloc_size = _round_up(size, granularity)
@@ -389,7 +396,10 @@ def allocate_exportable(
389396
"cuMemMap",
390397
)
391398
mapped = True
392-
self._mem_set_access(mapped_va, alloc_size)
399+
self._mem_set_access(
400+
int(access_va) if access_va is not None else mapped_va,
401+
int(access_size) if access_size is not None else alloc_size,
402+
)
393403
return LocalAllocation(
394404
va=mapped_va,
395405
size=alloc_size,
@@ -448,9 +458,18 @@ def _import_handle(self, handle_bytes: bytes) -> int:
448458
return int(imported.value)
449459

450460
def import_and_map(
451-
self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
461+
self,
462+
peer_rank: int,
463+
handle_bytes: bytes,
464+
size: int,
465+
va: Optional[int] = None,
466+
*,
467+
access_va: Optional[int] = None,
468+
access_size: Optional[int] = None,
452469
) -> PeerMapping:
453470
self._check_initialized()
471+
if (access_va is None) != (access_size is None):
472+
raise CudaFabricError("access_va and access_size must be provided together")
454473
imported_handle = self._import_handle(handle_bytes)
455474

456475
granularity = self._get_granularity()
@@ -472,7 +491,10 @@ def import_and_map(
472491
"cuMemMap",
473492
)
474493
mapped = True
475-
self._mem_set_access(mapped_va, size)
494+
self._mem_set_access(
495+
int(access_va) if access_va is not None else mapped_va,
496+
int(access_size) if access_size is not None else size,
497+
)
476498
except Exception:
477499
if mapped:
478500
try:

iris/drivers/local/amd.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,12 @@ def initialize(self, device_ordinal: int) -> None:
378378
logger.info("LocalHipDriver initialized (device %d)", device_ordinal)
379379

380380
def allocate_exportable(
381-
self, size: int, va: Optional[int] = None
381+
self,
382+
size: int,
383+
va: Optional[int] = None,
384+
*,
385+
access_va: Optional[int] = None,
386+
access_size: Optional[int] = None,
382387
) -> LocalAllocation:
383388
"""
384389
Allocate HIP VMem exportable as a DMA-BUF.
@@ -387,6 +392,8 @@ def allocate_exportable(
387392
granularity-aligned VA range containing [va, va + size).
388393
"""
389394
self._check_initialized()
395+
if (access_va is None) != (access_size is None):
396+
raise LocalHipError("access_va and access_size must be provided together")
390397
props = self._make_alloc_props()
391398
granularity = self._get_granularity()
392399
alloc_size = _round_up(size, granularity)
@@ -418,7 +425,10 @@ def allocate_exportable(
418425
"hipMemMap",
419426
)
420427
mapped = True
421-
self._mem_set_access(mapped_va, alloc_size)
428+
self._mem_set_access(
429+
int(access_va) if access_va is not None else mapped_va,
430+
int(access_size) if access_size is not None else alloc_size,
431+
)
422432
return LocalAllocation(
423433
va=mapped_va,
424434
size=alloc_size,
@@ -509,10 +519,19 @@ def export_handle(self, allocation: LocalAllocation) -> bytes:
509519
return self._export_range(allocation.va, allocation.size)
510520

511521
def import_and_map(
512-
self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
522+
self,
523+
peer_rank: int,
524+
handle_bytes: bytes,
525+
size: int,
526+
va: Optional[int] = None,
527+
*,
528+
access_va: Optional[int] = None,
529+
access_size: Optional[int] = None,
513530
) -> PeerMapping:
514531
"""Import a DMA-BUF descriptor and map it into local GPU address space."""
515532
self._check_initialized()
533+
if (access_va is None) != (access_size is None):
534+
raise LocalHipError("access_va and access_size must be provided together")
516535
if len(handle_bytes) != _AMD_HANDLE_BYTES:
517536
raise LocalHipError(
518537
f"AMD local handle must be {_AMD_HANDLE_BYTES} bytes, got {len(handle_bytes)}"
@@ -548,7 +567,10 @@ def import_and_map(
548567
"hipMemMap",
549568
)
550569
mapped = True
551-
self._mem_set_access(mapped_va, size)
570+
self._mem_set_access(
571+
int(access_va) if access_va is not None else mapped_va,
572+
int(access_size) if access_size is not None else size,
573+
)
552574
return PeerMapping(
553575
peer_rank=peer_rank,
554576
transport=InterconnectLevel.INTRA_NODE,

iris/drivers/local/nvidia.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,12 @@ def initialize(self, device_ordinal: int) -> None:
362362
logger.info("LocalCudaDriver initialized (device %d)", device_ordinal)
363363

364364
def allocate_exportable(
365-
self, size: int, va: Optional[int] = None
365+
self,
366+
size: int,
367+
va: Optional[int] = None,
368+
*,
369+
access_va: Optional[int] = None,
370+
access_size: Optional[int] = None,
366371
) -> LocalAllocation:
367372
"""
368373
Allocate CUDA VMM memory exportable as a POSIX FD.
@@ -371,6 +376,8 @@ def allocate_exportable(
371376
granularity-aligned VA range containing [va, va + size).
372377
"""
373378
self._check_initialized()
379+
if (access_va is None) != (access_size is None):
380+
raise LocalCudaError("access_va and access_size must be provided together")
374381
props = self._make_alloc_props()
375382
granularity = self._get_granularity()
376383
alloc_size = _round_up(size, granularity)
@@ -401,7 +408,10 @@ def allocate_exportable(
401408
"cuMemMap",
402409
)
403410
mapped = True
404-
self._mem_set_access(mapped_va, alloc_size)
411+
self._mem_set_access(
412+
int(access_va) if access_va is not None else mapped_va,
413+
int(access_size) if access_size is not None else alloc_size,
414+
)
405415
return LocalAllocation(
406416
va=mapped_va,
407417
size=alloc_size,
@@ -475,10 +485,19 @@ def _import_handle(self, handle_bytes: bytes) -> int:
475485
return int(imported.value)
476486

477487
def import_and_map(
478-
self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
488+
self,
489+
peer_rank: int,
490+
handle_bytes: bytes,
491+
size: int,
492+
va: Optional[int] = None,
493+
*,
494+
access_va: Optional[int] = None,
495+
access_size: Optional[int] = None,
479496
) -> PeerMapping:
480497
"""Import a POSIX-FD handle and map it into local CUDA VMM VA space."""
481498
self._check_initialized()
499+
if (access_va is None) != (access_size is None):
500+
raise LocalCudaError("access_va and access_size must be provided together")
482501
imported_handle = self._import_handle(handle_bytes)
483502

484503
granularity = self._get_granularity()
@@ -500,7 +519,10 @@ def import_and_map(
500519
"cuMemMap",
501520
)
502521
mapped = True
503-
self._mem_set_access(mapped_va, size)
522+
self._mem_set_access(
523+
int(access_va) if access_va is not None else mapped_va,
524+
int(access_size) if access_size is not None else size,
525+
)
504526
except Exception:
505527
steps: list[tuple[str, Callable[[], None]]] = []
506528
if mapped:

iris/host/distributed/topology.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,20 +1122,10 @@ def __init__(self, iris_ctx=None):
11221122
if num_gpus <= 0:
11231123
raise RuntimeError("TopologyDiscovery requires at least one GPU")
11241124

1125-
# Use LOCAL_RANK (set by torchrun/SLURM) for per-node GPU assignment.
1126-
# This is more robust than global_rank % num_gpus, which breaks when
1127-
# ranks aren't distributed in a way that aligns with device_count
1128-
# (e.g., 2 nodes with 8 GPUs each but only 4 ranks per node).
1129-
# The % num_gpus clamp handles isolation (LOCAL_RANK=3, device_count=1).
1130-
local_rank = int(os.environ.get("LOCAL_RANK", 0))
1131-
self.gpu_id = local_rank % num_gpus
1132-
# MUST set device BEFORE init_process_group — NCCL needs a CUDA
1133-
# device assigned to this process, otherwise all ranks fight over
1134-
# GPU 0 and init either fails or produces world_size=1.
1135-
torch.cuda.set_device(self.gpu_id)
11361125
if dist.is_initialized():
11371126
self.rank = dist.get_rank()
11381127
self.world_size = dist.get_world_size()
1128+
self.gpu_id = torch.cuda.current_device()
11391129
else:
11401130
raise RuntimeError("TopologyDiscovery requires an initialized distributed process group.")
11411131

iris/host/memory/allocators/vmem_chunked_allocator.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,15 @@ def _grow_chunk(self):
266266
)
267267

268268
target_va = self.base_va + self.mapped_extent
269-
allocation = self.driver.allocate_exportable(self.chunk_size, va=target_va)
269+
alloc_kwargs = {}
270+
if self.driver.__class__.__name__ == "LocalHipDriver":
271+
alloc_kwargs = {
272+
"access_va": self.base_va,
273+
"access_size": self.mapped_extent + self.chunk_size,
274+
}
275+
allocation = self.driver.allocate_exportable(
276+
self.chunk_size, va=target_va, **alloc_kwargs
277+
)
270278
self.chunks.append(allocation)
271279
self._shared_regions.append(
272280
_SharedRegion(va=allocation.va, size=allocation.size, allocation=allocation)
@@ -506,8 +514,18 @@ def import_external_tensor(self, external_tensor: torch.Tensor) -> torch.Tensor:
506514

507515
target_base_va = self.base_va + target_offset
508516
handle_bytes = self.driver.export_pointer_handle(alloc_base, alloc_size)
517+
import_kwargs = {}
518+
if self.driver.__class__.__name__ == "LocalHipDriver":
519+
import_kwargs = {
520+
"access_va": self.base_va,
521+
"access_size": target_offset + aligned_alloc_size,
522+
}
509523
mapping = self.driver.import_and_map(
510-
self.cur_rank, handle_bytes, aligned_alloc_size, va=target_base_va
524+
self.cur_rank,
525+
handle_bytes,
526+
aligned_alloc_size,
527+
va=target_base_va,
528+
**import_kwargs,
511529
)
512530
self._imported_heap_mappings.append(mapping)
513531
self._shared_regions.append(

iris/host/memory/symmetric_heap.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import numpy as np
1616
import torch
1717

18-
from iris.host.logging.logging import _log_rank, logger
18+
from iris.host.logging.logging import _log_rank
1919
from iris.host.memory.allocators import TorchAllocator, VMemAllocator, VMemChunkedAllocator
2020
from iris.drivers.base import PeerMapping
2121
from iris.host.distributed.fd_passing import setup_fd_infrastructure
@@ -142,7 +142,7 @@ def __init__(
142142
from iris.host.distributed.topology import TopologyDiscovery
143143

144144
try:
145-
topology = TopologyDiscovery.discover()
145+
topology = TopologyDiscovery().discover()
146146
except Exception as exc:
147147
logger.warning(
148148
"TopologyDiscovery.discover() failed (%s); VMemChunkedAllocator will default to INTRA_NODE driver.",
@@ -555,11 +555,18 @@ def _refresh_peer_access_chunked(self, dist):
555555
reconstructed_handle = _replace_fd_in_local_handle(
556556
peer_handle_bytes, cloned_fd
557557
)
558+
import_kwargs = {}
559+
if len(peer_handle_bytes) == _LOCAL_HIP_HANDLE_BYTES:
560+
import_kwargs = {
561+
"access_va": peer_va_base,
562+
"access_size": peer_offset + peer_size,
563+
}
558564
mapping = self.allocator.driver.import_and_map(
559565
peer,
560566
reconstructed_handle,
561567
peer_size,
562568
va=peer_va_base + peer_offset,
569+
**import_kwargs,
563570
)
564571
self._peer_imported_mappings[peer].append(mapping)
565572

0 commit comments

Comments
 (0)