lablup
diff --git a/‎changes/6498.feature.md‎
Lines changed: 1 addition & 0 deletions b/‎changes/6498.feature.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎configs/agent/sample.toml‎
Lines changed: 30 additions & 21 deletions b/‎configs/agent/sample.toml‎
Lines changed: 30 additions & 21 deletions
diff --git a/‎src/ai/backend/agent/agent.py‎
Lines changed: 29 additions & 10 deletions b/‎src/ai/backend/agent/agent.py‎
Lines changed: 29 additions & 10 deletions
diff --git a/‎src/ai/backend/agent/alloc_map.py‎
Lines changed: 11 additions & 0 deletions b/‎src/ai/backend/agent/alloc_map.py‎
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1 @@
+Add resource isolation options for multi-agent setup
@@ -143,6 +143,14 @@
 # If agents field is populated, this field indicates the default values for all
 # agents.
 [resource]
+  # Hard CPU allocation for this agent (e.g., 8 cores).
+  # Only used when allocation-mode is "manual".
+  # Every agent must specify this value when allocation-mode is "manual".
+  ## allocated-cpu = 8
+  # Hard memory allocation for this agent (e.g., "32G").
+  # Only used when allocation-mode is "manual".
+  # Every agent must specify this value when allocation-mode is "manual".
+  ## allocated-mem = "32G"
   # The number of CPU cores reserved for the operating system and the agent
   # service.
   reserved-cpu = 1
@@ -156,6 +164,12 @@
   # Currently this value is unused. In future releases, it may be used to preserve
   # the minimum disk space from the scratch disk allocation via loopback files.
   reserved-disk = "8G"
+  # Resource allocation mode for multi-agent scenarios.
+  # - `shared`: All agents share the full resource pool (default, backward
+  # compatible).
+  # - `auto-split`: Automatically divide resources equally (1/N) among all agents.
+  # - `manual`: Manually specify per-agent resource allocations via config.
+  allocation-mode = "shared"
   # The alignment of the reported main memory size to absorb tiny deviations from
   # per-node firmware/hardware settings. Recommended to be a multiple of the
   # page/hugepage size (e.g., 2 MiB).
@@ -165,6 +179,10 @@
   # Affinity policy
   affinity-policy = "INTERLEAVED"
 
+  # Device-specific per-slot resource allocations.
+  # Only used when allocation-mode is "manual".
+  [resource.allocated-devices]
+
 # Pyroscope configuration
 [pyroscope]
   # Whether to enable Pyroscope profiling
@@ -409,24 +427,15 @@
 
   # Resource config overrides for the individual agent
   [agents.resource]
-    # The number of CPU cores reserved for the operating system and the agent
-    # service.
-    reserved-cpu = 1
-    # The memory space reserved for the operating system and the agent service. It
-    # is subtracted from the reported main memory size and not available for user
-    # workload allocation. Depending on the memory-align-size option and system
-    # configuration, this may not be the exact value but have slightly less or more
-    # values within the memory-align-size.
-    reserved-mem = 1073741824
-    # The disk space reserved for the operating system and the agent service.
-    # Currently this value is unused. In future releases, it may be used to preserve
-    # the minimum disk space from the scratch disk allocation via loopback files.
-    reserved-disk = 8589934592
-    # The alignment of the reported main memory size to absorb tiny deviations from
-    # per-node firmware/hardware settings. Recommended to be multiple of the
-    # page/hugepage size (e.g., 2 MiB).
-    memory-align-size = 16777216
-    # Resource allocation order
-    allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
-    # Affinity policy
-    affinity-policy = 1
+    # Hard CPU allocation for this agent (e.g., 8 cores).
+    # Only used when allocation-mode is "manual".
+    # Every agent must specify this value when allocation-mode is "manual".
+    ## allocated-cpu = 8
+    # Hard memory allocation for this agent (e.g., "32G").
+    # Only used when allocation-mode is "manual".
+    # Every agent must specify this value when allocation-mode is "manual".
+    ## allocated-mem = "32G"
+
+    # Device-specific per-slot resource allocations.
+    # Only used when allocation-mode is "manual".
+    [agents.resource.allocated-devices]
@@ -238,6 +238,7 @@
     ComputerContext,
     KernelResourceSpec,
     Mount,
+    ResourcePartitioner,
     align_memory,
     allocate,
     known_slot_types,
@@ -765,7 +766,10 @@ class AbstractAgent(
     etcd: AsyncEtcd
     local_instance_id: str
     kernel_registry: MutableMapping[KernelId, AbstractKernel]
+    resource_partitioner: ResourcePartitioner
     computers: MutableMapping[DeviceName, ComputerContext]
+    total_slots: Mapping[SlotName, Decimal]
+    reserved_slots: Mapping[SlotName, Decimal]
     images: Mapping[ImageCanonical, ScannedImage]
     port_pool: set[int]
 
@@ -836,6 +840,7 @@ def __init__(
         error_monitor: ErrorPluginContext,
         skip_initial_scan: bool = False,
         agent_public_key: Optional[PublicKey],
+        resource_partitioner: ResourcePartitioner,
     ) -> None:
         self._skip_initial_scan = skip_initial_scan
         self.loop = current_loop()
@@ -845,7 +850,10 @@ def __init__(
         self.local_instance_id = generate_local_instance_id(__file__)
         self.agent_public_key = agent_public_key
         self.kernel_registry = {}
+        self.resource_partitioner = resource_partitioner
         self.computers = {}
+        self.total_slots = {}
+        self.reserved_slots = {}
         self.images = {}
         self.restarting_kernels = {}
         self.stat_ctx = StatContext(
@@ -941,6 +949,12 @@ async def __ainit__(self) -> None:
             self.computers[name] = ComputerContext(computer, devices, alloc_map)
             metadatas.append(computer.get_metadata())
 
+        self.total_slots = self.resource_partitioner.calculate_total_slots(
+            self.computers, self.local_config.resource_common
+        )
+        self.reserved_slots = self.resource_partitioner.restrict_computer_resources(
+            self.computers, self.total_slots
+        )
         self.slots = await self.update_slots()
         log.info("Resource slots: {!r}", self.slots)
         log.info("Slot types: {!r}", known_slot_types)
@@ -1947,6 +1961,7 @@ async def load_resources(
         """
         Detect available resources attached on the system and load corresponding device plugin.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def scan_available_resources(
@@ -1955,6 +1970,7 @@ async def scan_available_resources(
         """
         Scan and define the amount of available resource slots in this node.
         """
+        raise NotImplementedError
 
     async def update_slots(
         self,
@@ -1965,14 +1981,9 @@ async def update_slots(
         """
         scanned_slots = await self.scan_available_resources()
         usable_slots: dict[SlotName, Decimal] = {}
-        reserved_slots = {
-            SlotName("cpu"): Decimal(self.local_config.resource.reserved_cpu),
-            SlotName("mem"): Decimal(self.local_config.resource.reserved_mem),
-            SlotName("disk"): Decimal(self.local_config.resource.reserved_disk),
-        }
         for slot_name, slot_capacity in scanned_slots.items():
             if slot_name == SlotName("mem"):
-                mem_reserved = int(reserved_slots.get(slot_name, 0))
+                mem_reserved = int(self.reserved_slots.get(slot_name, 0))
                 mem_align = int(self.local_config.resource.memory_align_size)
                 mem_usable, mem_reserved = align_memory(
                     int(slot_capacity), mem_reserved, align=mem_align
@@ -1986,7 +1997,7 @@ async def update_slots(
                 )
             else:
                 usable_capacity = max(
-                    Decimal(0), slot_capacity - reserved_slots.get(slot_name, Decimal(0))
+                    Decimal(0), slot_capacity - self.reserved_slots.get(slot_name, Decimal(0))
                 )
             usable_slots[slot_name] = usable_capacity
         return usable_slots
@@ -2098,6 +2109,7 @@ async def scan_images(self) -> ScanImagesResult:
         This is called periodically to keep the image list up-to-date and allow
         manual image addition and deletions by admins.
         """
+        raise NotImplementedError
 
     async def _scan_images_wrapper(self, interval: float) -> None:
         result = await self.scan_images()
@@ -2118,6 +2130,7 @@ async def push_image(
         """
         Push the given image to the given registry.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def pull_image(
@@ -2130,12 +2143,14 @@ async def pull_image(
         """
         Pull the given image from the given registry.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def purge_images(self, request: PurgeImagesReq) -> PurgeImagesResp:
         """
         Purge the given images from the agent.
         """
+        raise NotImplementedError
 
     async def check_and_pull(
         self,
@@ -2267,7 +2282,7 @@ async def check_image(
         Check the availability of the image and return a boolean flag that indicates whether
         the agent should try pulling the image from a registry.
         """
-        return False
+        raise NotImplementedError
 
     async def scan_running_kernels(self) -> None:
         """
@@ -3489,6 +3504,7 @@ async def destroy_kernel(
         * Send SIGTERM to the kernel's main process.
         * Send SIGKILL if it's not terminated within a few seconds.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def clean_kernel(
@@ -3512,6 +3528,7 @@ async def clean_kernel(
         The ``container_id`` may be ``None`` if the container has already gone away.
         In such cases, skip container-specific cleanups.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def create_local_network(self, network_name: str) -> None:
@@ -3523,6 +3540,7 @@ async def create_local_network(self, network_name: str) -> None:
         It may raise :exc:`NotImplementedError` and then the manager
         will cancel creation of the session.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def destroy_local_network(self, network_name: str) -> None:
@@ -3531,6 +3549,7 @@ async def destroy_local_network(self, network_name: str) -> None:
 
         This is called by the manager after kernel destruction.
         """
+        raise NotImplementedError
 
     @abstractmethod
     async def restart_kernel__load_config(
@@ -3541,7 +3560,7 @@ async def restart_kernel__load_config(
         """
         Restore the cluster config from a previous launch of the kernel.
         """
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     async def restart_kernel__store_config(
@@ -3554,7 +3573,7 @@ async def restart_kernel__store_config(
         Store the cluster config to a kernel-related storage (e.g., scratch space),
         so that restarts of this kernel can reuse the configuration.
         """
-        pass
+        raise NotImplementedError
 
     async def restart_kernel(
         self,
 
@@ -236,6 +236,17 @@ def update_affinity_hint(
                     hint_for_next_allocation.append(dev)
         affinity_hint.devices = hint_for_next_allocation
 
+    @final
+    def update_device_slot_amounts(self, slot_amounts: Mapping[SlotName, Decimal]) -> None:
+        self.device_slots = {
+            device_id: DeviceSlotInfo(
+                slot_type=slot_info.slot_type,
+                slot_name=slot_info.slot_name,
+                amount=slot_amounts[slot_info.slot_name],
+            )
+            for device_id, slot_info in self.device_slots.items()
+        }
+
     @abstractmethod
     def allocate(
         self,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add resource isolation options for multi-agent setup`