Skip to content

Commit 79d9b6a

Browse files
committed
feat(BA-3024): Add custom resource alloc for agents in config
This change adds configuration for partitioning hardware resources among agents, instead of every agent always seeing the full resource pool; this prevents unintended over-allocation that could crash kernels. Three allocation modes are available. SHARED lets every agent see the full resources; this is the same behavior as before and is useful for stress testing. AUTO_SPLIT automatically divides resources equally among the agents. MANUAL lets users specify exact per-agent allocations for all resources. Single-agent deployments remain unaffected and retain access to all available hardware resources.
1 parent a3d0d3b commit 79d9b6a

File tree

4 files changed

+690
-37
lines changed

4 files changed

+690
-37
lines changed

changes/6724.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add custom resource allocation in agent server config

configs/agent/sample.toml

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,14 @@
143143
# If agents field is populated, this field indicates the default values for all
144144
# agents.
145145
[resource]
146+
# Hard CPU allocation for this agent (e.g., 8 cores).
147+
# Only used in MANUAL allocation mode.
148+
# All agents must specify this value when allocation-mode is MANUAL.
149+
## allocated-cpu = 8
150+
# Hard memory allocation for this agent (e.g., "32G").
151+
# Only used in MANUAL allocation mode.
152+
# All agents must specify this value when allocation-mode is MANUAL.
153+
## allocated-mem = "32G"
146154
# The number of CPU cores reserved for the operating system and the agent
147155
# service.
148156
reserved-cpu = 1
@@ -156,6 +164,12 @@
156164
# Currently this value is unused. In future releases, it may be used to preserve
157165
# the minimum disk space from the scratch disk allocation via loopback files.
158166
reserved-disk = "8G"
167+
# Resource allocation mode for multi-agent scenarios.
168+
# - `shared`: All agents share the full resource pool (default, backward
169+
# compatible).
170+
# - `auto-split`: Automatically divide resources equally (1/N) among all agents.
171+
# - `manual`: Manually specify per-agent resource allocations via config.
172+
allocation-mode = "shared"
159173
# The alignment of the reported main memory size to absorb tiny deviations from
160174
# per-node firmware/hardware settings. Recommended to be multiple of the
161175
# page/hugepage size (e.g., 2 MiB).
@@ -165,6 +179,10 @@
165179
# Affinity policy
166180
affinity-policy = "INTERLEAVED"
167181

182+
# Device-specific per-slot resource allocations.
183+
# Only used in MANUAL allocation mode.
184+
[resource.allocated-devices]
185+
168186
# Pyroscope configuration
169187
[pyroscope]
170188
# Whether to enable Pyroscope profiling
@@ -409,24 +427,15 @@
409427

410428
# Resource config overrides for the individual agent
411429
[agents.resource]
412-
# The number of CPU cores reserved for the operating system and the agent
413-
# service.
414-
reserved-cpu = 1
415-
# The memory space reserved for the operating system and the agent service. It
416-
# is subtracted from the reported main memory size and not available for user
417-
# workload allocation. Depending on the memory-align-size option and system
418-
# configuration, this may not be the exact value but have slightly less or more
419-
# values within the memory-align-size.
420-
reserved-mem = 1073741824
421-
# The disk space reserved for the operating system and the agent service.
422-
# Currently this value is unused. In future releases, it may be used to preserve
423-
# the minimum disk space from the scratch disk allocation via loopback files.
424-
reserved-disk = 8589934592
425-
# The alignment of the reported main memory size to absorb tiny deviations from
426-
# per-node firmware/hardware settings. Recommended to be multiple of the
427-
# page/hugepage size (e.g., 2 MiB).
428-
memory-align-size = 16777216
429-
# Resource allocation order
430-
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
431-
# Affinity policy
432-
affinity-policy = 1
430+
# Hard CPU allocation for this agent (e.g., 8 cores).
431+
# Only used in MANUAL allocation mode.
432+
# All agents must specify this value when allocation-mode is MANUAL.
433+
## allocated-cpu = 8
434+
# Hard memory allocation for this agent (e.g., "32G").
435+
# Only used in MANUAL allocation mode.
436+
# All agents must specify this value when allocation-mode is MANUAL.
437+
## allocated-mem = "32G"
438+
439+
# Device-specific per-slot resource allocations.
440+
# Only used in MANUAL allocation mode.
441+
[agents.resource.allocated-devices]

src/ai/backend/agent/config/unified.py

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import os
1212
import sys
1313
import textwrap
14+
from decimal import Decimal
1415
from pathlib import Path
1516
from typing import (
1617
Any,
@@ -49,6 +50,7 @@
4950
BinarySizeField,
5051
ResourceGroupType,
5152
ServiceDiscoveryType,
53+
SlotName,
5254
)
5355
from ai.backend.logging import BraceStyleAdapter
5456
from ai.backend.logging.config import LoggingConfig
@@ -77,6 +79,12 @@ class ScratchType(enum.StrEnum):
7779
K8S_NFS = "k8s-nfs"
7880

7981

82+
class ResourceAllocationMode(enum.StrEnum):
83+
SHARED = "shared"
84+
AUTO_SPLIT = "auto-split"
85+
MANUAL = "manual"
86+
87+
8088
class AgentConfigValidationContext(BaseConfigValidationContext):
8189
is_invoked_subcommand: bool
8290

@@ -867,7 +875,7 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig):
867875
pass
868876

869877

870-
class ResourceConfig(BaseConfigSchema):
878+
class CommonResourceConfig(BaseConfigSchema):
871879
reserved_cpu: int = Field(
872880
default=1,
873881
description="The number of CPU cores reserved for the operating system and the agent service.",
@@ -899,6 +907,18 @@ class ResourceConfig(BaseConfigSchema):
899907
validation_alias=AliasChoices("reserved-disk", "reserved_disk"),
900908
serialization_alias="reserved-disk",
901909
)
910+
allocation_mode: ResourceAllocationMode = Field(
911+
default=ResourceAllocationMode.SHARED,
912+
description=textwrap.dedent("""
913+
Resource allocation mode for multi-agent scenarios.
914+
- `shared`: All agents share the full resource pool (default, backward compatible).
915+
- `auto-split`: Automatically divide resources equally (1/N) among all agents.
916+
- `manual`: Manually specify per-agent resource allocations via config.
917+
"""),
918+
examples=[item.value for item in ResourceAllocationMode],
919+
validation_alias=AliasChoices("allocation-mode", "allocation_mode"),
920+
serialization_alias="allocation-mode",
921+
)
902922
memory_align_size: BinarySizeField = Field(
903923
default=BinarySize.finite_from_str("16M"),
904924
description=(
@@ -941,6 +961,64 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy:
941961
return v
942962

943963

964+
# Manual resource allocation values (`allocated-cpu`, `allocated-mem`,
# `allocated-devices`). Every field is optional at this level; the only check
# performed here is that supplied values are not negative.
class OverridableResourceConfig(BaseConfigSchema):
    allocated_cpu: Optional[int] = Field(
        default=None,
        description=textwrap.dedent("""
        Hard CPU allocation for this agent (e.g., 8 cores).
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=[8, 16],
        validation_alias=AliasChoices("allocated-cpu", "allocated_cpu"),
        serialization_alias="allocated-cpu",
    )
    allocated_mem: Optional[BinarySizeField] = Field(
        default=None,
        description=textwrap.dedent("""
        Hard memory allocation for this agent (e.g., "32G").
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=["32G", "64G"],
        validation_alias=AliasChoices("allocated-mem", "allocated_mem"),
        serialization_alias="allocated-mem",
    )
    allocated_devices: Mapping[SlotName, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
        Device-specific per-slot resource allocations.
        Only used in MANUAL allocation mode.
        """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
        validation_alias=AliasChoices("allocated-devices", "allocated_devices"),
        serialization_alias="allocated-devices",
    )

    # NOTE(review): unknown keys are accepted rather than rejected here --
    # presumably so that tables which also carry other resource keys still
    # validate against this schema; confirm before tightening.
    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """Reject negative allocation values.

        Despite the name, only negative values are rejected: zero passes for
        the CPU, memory, and device allocations alike. Unset CPU/memory values
        (``None``) are skipped.

        Raises:
            ValueError: If ``allocated_cpu``, ``allocated_mem``, or any value
                in ``allocated_devices`` is below zero.
        """
        if self.allocated_cpu is not None and self.allocated_cpu < 0:
            raise ValueError(
                f"Allocated cpu must not be a negative value, but given {self.allocated_cpu}"
            )
        if self.allocated_mem is not None and self.allocated_mem < 0:
            raise ValueError(
                f"Allocated mem must not be a negative value, but given {self.allocated_mem}"
            )
        if any(value < 0 for value in self.allocated_devices.values()):
            raise ValueError("All allocated device resource values must not be a negative value")
        return self
1016+
1017+
1018+
# Full resource configuration: the settings of CommonResourceConfig combined
# with the manual allocation values of OverridableResourceConfig. It adds no
# fields or validators of its own.
class ResourceConfig(CommonResourceConfig, OverridableResourceConfig):
    pass
1020+
1021+
9441022
class EtcdConfig(BaseConfigSchema):
9451023
namespace: str = Field(
9461024
description="Etcd namespace",
@@ -1187,7 +1265,7 @@ class AgentOverrideConfig(BaseConfigSchema):
11871265
default=None,
11881266
description="Container config overrides for the individual agent",
11891267
)
1190-
resource: ResourceConfig | None = Field(
1268+
resource: OverridableResourceConfig | None = Field(
11911269
default=None,
11921270
description="Resource config overrides for the individual agent",
11931271
)
@@ -1315,3 +1393,34 @@ def _validate_agent_configs(self) -> Self:
13151393
config.validate_agent_specific_config()
13161394

13171395
return self
1396+
1397+
@model_validator(mode="after")
1398+
def _validate_resource_allocation_mode(self) -> Self:
1399+
match self.resource.allocation_mode:
1400+
case ResourceAllocationMode.SHARED | ResourceAllocationMode.AUTO_SPLIT:
1401+
for config in self.agent_configs:
1402+
resource = config.resource
1403+
if any([
1404+
resource.allocated_cpu is not None,
1405+
resource.allocated_mem is not None,
1406+
resource.allocated_devices,
1407+
]):
1408+
raise ValueError(
1409+
"On non-MANUAL mode, config must not specify manual resource allocations"
1410+
)
1411+
1412+
case ResourceAllocationMode.MANUAL:
1413+
for config in self.agent_configs:
1414+
resource = config.resource
1415+
if any([resource.allocated_cpu is None, resource.allocated_mem is None]):
1416+
raise ValueError(
1417+
"On MANUAL mode, config must specify cpu and mem resource allocations"
1418+
)
1419+
1420+
slot_names = [
1421+
set(config.resource.allocated_devices.keys()) for config in self.agent_configs
1422+
]
1423+
if not all(slot_name == slot_names[0] for slot_name in slot_names):
1424+
raise ValueError("All agents must have the same slots defined in the devices!")
1425+
1426+
return self

0 commit comments

Comments
 (0)