lablup
diff --git a/‎changes/6498.feature.md‎
Lines changed: 1 addition & 0 deletions b/‎changes/6498.feature.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/ai/backend/agent/agent.py‎
Lines changed: 16 additions & 8 deletions b/‎src/ai/backend/agent/agent.py‎
Lines changed: 16 additions & 8 deletions
diff --git a/‎src/ai/backend/agent/alloc_map.py‎
Lines changed: 11 additions & 0 deletions b/‎src/ai/backend/agent/alloc_map.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/ai/backend/agent/docker/agent.py‎
Lines changed: 3 additions & 0 deletions b/‎src/ai/backend/agent/docker/agent.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/ai/backend/agent/kubernetes/agent.py‎
Lines changed: 3 additions & 0 deletions b/‎src/ai/backend/agent/kubernetes/agent.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/ai/backend/agent/resources.py‎
Lines changed: 170 additions & 0 deletions b/‎src/ai/backend/agent/resources.py‎
Lines changed: 170 additions & 0 deletions
@@ -0,0 +1 @@
+Add resource isolation options for multi-agent setup
@@ -238,6 +238,7 @@
     ComputerContext,
     KernelResourceSpec,
     Mount,
+    ResourcePartitioner,
     align_memory,
     allocate,
     known_slot_types,
@@ -765,7 +766,10 @@ class AbstractAgent(
     etcd: AsyncEtcd
     local_instance_id: str
     kernel_registry: MutableMapping[KernelId, AbstractKernel]
+    resource_partitioner: ResourcePartitioner
     computers: MutableMapping[DeviceName, ComputerContext]
+    total_slots: Mapping[SlotName, Decimal]
+    reserved_slots: Mapping[SlotName, Decimal]
     images: Mapping[ImageCanonical, ScannedImage]
     port_pool: set[int]
 
@@ -836,6 +840,7 @@ def __init__(
         error_monitor: ErrorPluginContext,
         skip_initial_scan: bool = False,
         agent_public_key: Optional[PublicKey],
+        resource_partitioner: ResourcePartitioner,
     ) -> None:
         self._skip_initial_scan = skip_initial_scan
         self.loop = current_loop()
@@ -845,7 +850,10 @@ def __init__(
         self.local_instance_id = generate_local_instance_id(__file__)
         self.agent_public_key = agent_public_key
         self.kernel_registry = {}
+        self.resource_partitioner = resource_partitioner
         self.computers = {}
+        self.total_slots = {}
+        self.reserved_slots = {}
         self.images = {}
         self.restarting_kernels = {}
         self.stat_ctx = StatContext(
@@ -941,6 +949,12 @@ async def __ainit__(self) -> None:
             self.computers[name] = ComputerContext(computer, devices, alloc_map)
             metadatas.append(computer.get_metadata())
 
+        self.total_slots = self.resource_partitioner.calculate_total_slots(
+            self.computers, self.local_config.resource_common
+        )
+        self.reserved_slots = self.resource_partitioner.restrict_computer_resources(
+            self.computers, self.total_slots
+        )
         self.slots = await self.update_slots()
         log.info("Resource slots: {!r}", self.slots)
         log.info("Slot types: {!r}", known_slot_types)
@@ -1965,14 +1979,9 @@ async def update_slots(
         """
         scanned_slots = await self.scan_available_resources()
         usable_slots: dict[SlotName, Decimal] = {}
-        reserved_slots = {
-            SlotName("cpu"): Decimal(self.local_config.resource.reserved_cpu),
-            SlotName("mem"): Decimal(self.local_config.resource.reserved_mem),
-            SlotName("disk"): Decimal(self.local_config.resource.reserved_disk),
-        }
         for slot_name, slot_capacity in scanned_slots.items():
             if slot_name == SlotName("mem"):
-                mem_reserved = int(reserved_slots.get(slot_name, 0))
+                mem_reserved = int(self.reserved_slots.get(slot_name, 0))
                 mem_align = int(self.local_config.resource.memory_align_size)
                 mem_usable, mem_reserved = align_memory(
                     int(slot_capacity), mem_reserved, align=mem_align
@@ -1986,7 +1995,7 @@ async def update_slots(
                 )
             else:
                 usable_capacity = max(
-                    Decimal(0), slot_capacity - reserved_slots.get(slot_name, Decimal(0))
+                    Decimal(0), slot_capacity - self.reserved_slots.get(slot_name, Decimal(0))
                 )
             usable_slots[slot_name] = usable_capacity
         return usable_slots
@@ -2267,7 +2276,6 @@ async def check_image(
         Check the availability of the image and return a boolean flag that indicates whether
         the agent should try pulling the image from a registry.
         """
-        return False
 
     async def scan_running_kernels(self) -> None:
         """
 
@@ -236,6 +236,17 @@ def update_affinity_hint(
                     hint_for_next_allocation.append(dev)
         affinity_hint.devices = hint_for_next_allocation
 
+    @final
+    def update_device_slot_amounts(self, slot_amounts: Mapping[SlotName, Decimal]) -> None:
+        """
+        Rebuild ``device_slots`` so that every entry takes its amount from ``slot_amounts``,
+        looked up by the entry's slot name.
+
+        The slot type and slot name of each entry are carried over unchanged.
+        A ``KeyError`` propagates when ``slot_amounts`` lacks one of the slot names; in that
+        case ``device_slots`` is left as it was because the new mapping is assigned only
+        after it has been built completely.
+        """
+        resized = {}
+        for dev_id, current in self.device_slots.items():
+            resized[dev_id] = DeviceSlotInfo(
+                slot_type=current.slot_type,
+                slot_name=current.slot_name,
+                amount=slot_amounts[current.slot_name],
+            )
+        self.device_slots = resized
+
     @abstractmethod
     def allocate(
         self,
 
@@ -116,6 +116,7 @@
     ComputerContext,
     KernelResourceSpec,
     Mount,
+    ResourcePartitioner,
     known_slot_types,
 )
 from ..scratch import create_loop_filesystem, destroy_loop_filesystem
@@ -1315,6 +1316,7 @@ def __init__(
         skip_initial_scan: bool = False,
         agent_public_key: Optional[PublicKey],
         metadata_server: MetadataServer,
+        resource_partitioner: ResourcePartitioner,
     ) -> None:
         super().__init__(
             etcd,
@@ -1323,6 +1325,7 @@ def __init__(
             error_monitor=error_monitor,
             skip_initial_scan=skip_initial_scan,
             agent_public_key=agent_public_key,
+            resource_partitioner=resource_partitioner,
         )
         self.checked_invalid_images = set()
         self.metadata_server = metadata_server
 
@@ -74,6 +74,7 @@
     ComputerContext,
     KernelResourceSpec,
     Mount,
+    ResourcePartitioner,
     known_slot_types,
 )
 from ..types import Container, KernelOwnershipData, MountInfo, Port
@@ -829,6 +830,7 @@ def __init__(
         error_monitor: ErrorPluginContext,
         skip_initial_scan: bool = False,
         agent_public_key: Optional[PublicKey],
+        resource_partitioner: ResourcePartitioner,
     ) -> None:
         super().__init__(
             etcd,
@@ -837,6 +839,7 @@ def __init__(
             error_monitor=error_monitor,
             skip_initial_scan=skip_initial_scan,
             agent_public_key=agent_public_key,
+            resource_partitioner=resource_partitioner,
         )
 
     async def __ainit__(self) -> None:
 
@@ -29,6 +29,11 @@
 import aiodocker
 import attrs
 
+from ai.backend.agent.config.unified import (
+    CommonResourceConfig,
+    ResourceAllocationMode,
+    ResourceConfig,
+)
 from ai.backend.common.json import dump_json_str, load_json
 from ai.backend.common.plugin import AbstractPlugin, BasePluginContext
 from ai.backend.common.types import (
@@ -71,6 +76,17 @@
 known_slot_types: Mapping[SlotName, SlotTypes] = {}
 
 
+def _combine_mappings(mappings: list[Mapping[SlotName, Decimal]]) -> dict[SlotName, Decimal]:
+    """
+    Merge the given slot mappings into one new dict.
+
+    The mappings must not share keys: a ``ValueError`` is raised as soon as a mapping
+    has a key in common with the ones merged before it.
+    The input mappings themselves are never modified.
+    """
+    merged: dict[SlotName, Decimal] = {}
+    for current in mappings:
+        if not merged.keys().isdisjoint(current.keys()):
+            raise ValueError(
+                f"Duplicate keys found in devices: {merged.keys()} and {current.keys()}"
+            )
+        merged.update(current)
+    return merged
+
+
 @attrs.define(auto_attribs=True, slots=True)
 class ComputerContext:
     instance: AbstractComputePlugin
@@ -444,6 +460,160 @@ def get_additional_allowed_syscalls(self) -> list[str]:
         return []
 
 
+class ResourcePartitioner:
+    """
+    Decides how much of each resource slot one agent may use when ``num_agents`` agents
+    are configured.
+
+    The share is selected by ``resource_config.allocation_mode``:
+
+    * ``SHARED``: every slot is used in full (after deducting the reserved cpu/mem).
+    * ``AUTO_SPLIT``: every slot is divided by ``num_agents``.
+    * ``MANUAL``: the amounts come from the ``allocated_cpu``, ``allocated_mem`` and
+      ``allocated_devices`` fields of the resource config.
+    """
+
+    def __init__(
+        self,
+        resource_config: ResourceConfig,
+        num_agents: int,
+        agent_idx: int,
+    ) -> None:
+        """
+        Store the partitioning parameters.
+
+        ``agent_idx`` is only consulted in ``AUTO_SPLIT`` mode, where it decides which agents
+        receive one extra unit of a discrete slot that does not divide evenly.
+        ``resource_scaling_factor`` stays empty until ``restrict_computer_resources()`` runs.
+        """
+        self.resource_config = resource_config
+        self.num_agents = num_agents
+        self.agent_idx = agent_idx
+        self.resource_scaling_factor: Mapping[SlotName, Decimal] = {}
+
+    @staticmethod
+    def calculate_total_slots(
+        computers: Mapping[DeviceName, ComputerContext],
+        resource_config: CommonResourceConfig,
+        deduct_reserved: bool = False,
+    ) -> dict[SlotName, Decimal]:
+        """
+        Sum the amounts of all device slots of all computers, grouped by slot name.
+
+        When ``deduct_reserved`` is true, the reserved cpu/mem amounts of ``resource_config``
+        are subtracted from the sums; otherwise ``resource_config`` is not used.
+        The result is always a plain ``dict``.
+        """
+        total_slots: dict[SlotName, Decimal] = defaultdict(lambda: Decimal("0"))
+        for device in computers.values():
+            for slot_info in device.alloc_map.device_slots.values():
+                total_slots[slot_info.slot_name] += slot_info.amount
+        if deduct_reserved:
+            return ResourcePartitioner.deduct_reserved_resources(total_slots, resource_config)
+        # Do not leak the defaultdict: with it, a mere lookup of an unknown slot name would
+        # insert a zero entry and make later ``in`` checks on the totals succeed.
+        return dict(total_slots)
+
+    @staticmethod
+    def deduct_reserved_resources(
+        total_slots: Mapping[SlotName, Decimal],
+        resource_config: CommonResourceConfig,
+    ) -> dict[SlotName, Decimal]:
+        """
+        Return a copy of ``total_slots`` with ``reserved_cpu`` and ``reserved_mem`` subtracted
+        from the "cpu" and "mem" slots. All other slots are copied as they are.
+        """
+        reserved_resources = {
+            SlotName("cpu"): Decimal(resource_config.reserved_cpu),
+            SlotName("mem"): Decimal(resource_config.reserved_mem),
+        }
+        # NOTE(review): the result is intentionally not clamped at zero here, so a reservation
+        # larger than the total yields a negative amount -- confirm callers expect that.
+        return {
+            slot_name: slot - reserved_resources.get(slot_name, Decimal("0"))
+            for slot_name, slot in total_slots.items()
+        }
+
+    def restrict_computer_resources(
+        self,
+        computers: MutableMapping[DeviceName, ComputerContext],
+        total_slots: Mapping[SlotName, Decimal],
+    ) -> dict[SlotName, Decimal]:
+        """
+        Shrink the device slots of every computer to this agent's share and return the
+        amount of each slot that is thereby withheld from this agent.
+
+        Side effects: the alloc map of each computer is updated in place through
+        ``update_device_slot_amounts()`` and ``resource_scaling_factor`` is recalculated.
+        Raises ``ValueError`` when two computers expose the same slot name.
+        """
+        devices_allocated_slots: list[Mapping[SlotName, Decimal]] = []
+        devices_reserved_slots: list[Mapping[SlotName, Decimal]] = []
+        for device in computers.values():
+            device_allocated_slots = self._calculate_device_slots(device.alloc_map, total_slots)
+            device.alloc_map.update_device_slot_amounts(device_allocated_slots)
+            devices_allocated_slots.append(device_allocated_slots)
+
+            device_reserved_slots = self._calculate_reserved_slots(
+                device_allocated_slots, total_slots
+            )
+            devices_reserved_slots.append(device_reserved_slots)
+
+        allocated_slots = _combine_mappings(devices_allocated_slots)
+        self.resource_scaling_factor = self._calculate_resource_scaling_factor(
+            allocated_slots, total_slots
+        )
+
+        return _combine_mappings(devices_reserved_slots)
+
+    def get_resource_scaling_factor(self, slot_name: SlotName) -> Decimal:
+        """
+        Return the scaling factor of ``slot_name`` as computed by the last call to
+        ``restrict_computer_resources()``.
+
+        Raises ``KeyError`` when no factor is known for the slot, which is always the case
+        before ``restrict_computer_resources()`` has been called.
+        """
+        return self.resource_scaling_factor[slot_name]
+
+    def _calculate_device_slots(
+        self,
+        alloc_map: AbstractAllocMap,
+        total_slots: Mapping[SlotName, Decimal],
+    ) -> dict[SlotName, Decimal]:
+        """
+        Compute this agent's share for every slot name found in ``alloc_map``.
+
+        The shares are derived from the totals after the reserved cpu/mem is deducted.
+        """
+        total_slots_no_reserved = ResourcePartitioner.deduct_reserved_resources(
+            total_slots, self.resource_config
+        )
+        return {
+            device_slot.slot_name: self._calculate_device_slot(
+                device_slot.slot_name,
+                total_slots_no_reserved[device_slot.slot_name],
+                type(alloc_map),
+            )
+            for device_slot in alloc_map.device_slots.values()
+        }
+
+    def _calculate_device_slot(
+        self,
+        slot_name: SlotName,
+        total_slot: Decimal,
+        alloc_map_type: Type[AbstractAllocMap],
+    ) -> Decimal:
+        """
+        Compute this agent's share of a single slot.
+
+        Raises ``NotImplementedError`` in ``AUTO_SPLIT`` mode for an alloc map class that is
+        neither discrete nor fractional, and ``ValueError`` when the ``MANUAL`` mode lacks
+        the configured amount for the slot or when the allocation mode is unknown.
+        """
+        match self.resource_config.allocation_mode:
+            case ResourceAllocationMode.SHARED:
+                return total_slot
+            case ResourceAllocationMode.AUTO_SPLIT:
+                # issubclass() rather than an identity check, so that specialised alloc maps
+                # derived from the two base kinds are split like their parents.
+                if issubclass(alloc_map_type, DiscretePropertyAllocMap):
+                    # Discrete slots cannot be split fractionally: the agents with the
+                    # lowest indices take one extra unit each until the remainder is used up.
+                    slot, slot_extra = divmod(total_slot, self.num_agents)
+                    remainder_value = 1 if self.agent_idx < slot_extra else 0
+                    return slot + remainder_value
+                if issubclass(alloc_map_type, FractionAllocMap):
+                    return total_slot / self.num_agents
+                raise NotImplementedError(
+                    f"Unrecognized AbstractAllocMap type {alloc_map_type}"
+                )
+            case ResourceAllocationMode.MANUAL:
+                config = self.resource_config
+                # Explicit raises instead of assert: assertions are removed under
+                # ``python -O`` and a missing amount must never pass silently.
+                if slot_name == "cpu":
+                    if config.allocated_cpu is None:
+                        raise ValueError(
+                            "allocated_cpu is not configured for MANUAL resource allocation"
+                        )
+                    return Decimal(config.allocated_cpu)
+                if slot_name == "mem":
+                    if config.allocated_mem is None:
+                        raise ValueError(
+                            "allocated_mem is not configured for MANUAL resource allocation"
+                        )
+                    return Decimal(config.allocated_mem)
+                if slot_name not in config.allocated_devices:
+                    raise ValueError(
+                        f"{slot_name=} not found in config {config.allocated_devices!r}"
+                    )
+                return config.allocated_devices[slot_name]
+            case _:
+                # Fail loudly rather than implicitly returning None.
+                raise ValueError(
+                    f"Unsupported resource allocation mode: "
+                    f"{self.resource_config.allocation_mode!r}"
+                )
+
+    def _calculate_reserved_slots(
+        self,
+        device_slots: Mapping[SlotName, Decimal],
+        total_slots: Mapping[SlotName, Decimal],
+    ) -> dict[SlotName, Decimal]:
+        """
+        For every slot in ``device_slots``, return the part of the total that is not given
+        to this agent, never going below zero.
+        """
+        return {
+            slot_name: max(total_slots[slot_name] - slot, Decimal(0))
+            for slot_name, slot in device_slots.items()
+        }
+
+    def _calculate_resource_scaling_factor(
+        self,
+        allocated_slots: Mapping[SlotName, Decimal],
+        total_slots: Mapping[SlotName, Decimal],
+    ) -> dict[SlotName, Decimal]:
+        """
+        Compute the per-slot scaling factor of this agent.
+
+        ``SHARED`` and ``AUTO_SPLIT`` return a ``defaultdict`` that yields the same factor
+        (``1`` and ``1 / num_agents`` respectively) for any slot name. ``MANUAL`` returns
+        ``allocated / total`` for the allocated slots only and raises ``ValueError`` when
+        "cpu" or "mem" is missing from either mapping, or when the mode is unknown.
+        """
+        match self.resource_config.allocation_mode:
+            case ResourceAllocationMode.SHARED:
+                return defaultdict(lambda: Decimal(1.0))
+            case ResourceAllocationMode.AUTO_SPLIT:
+                return defaultdict(lambda: Decimal(1.0) / Decimal(self.num_agents))
+            case ResourceAllocationMode.MANUAL:
+                if SlotName("cpu") not in allocated_slots or SlotName("cpu") not in total_slots:
+                    raise ValueError("CPU not in allocated or total slots seen")
+                if SlotName("mem") not in allocated_slots or SlotName("mem") not in total_slots:
+                    raise ValueError("Memory not in allocated or total slots seen")
+                return {
+                    slot_name: slot / total_slots[slot_name]
+                    for slot_name, slot in allocated_slots.items()
+                }
+            case _:
+                # Fail loudly rather than implicitly returning None.
+                raise ValueError(
+                    f"Unsupported resource allocation mode: "
+                    f"{self.resource_config.allocation_mode!r}"
+                )
+
+
 class ComputePluginContext(BasePluginContext[AbstractComputePlugin]):
     plugin_group = "backendai_accelerator_v21"
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add resource isolation options for multi-agent setup`