Skip to content

Commit 6d2e3d2

Browse files
committed
feat(BA-3024): Add custom resource alloc for agents in config
This change adds configuration for partitioning resources among agents, instead of every agent always seeing the full resource pool. Partitioning prevents unintended over-allocation that could crash kernels. SHARED mode lets all agents see the full resources (useful for stress testing); this is the same behavior as before. AUTO_SPLIT mode automatically divides resources equally among agents. MANUAL mode lets users specify exact per-agent allocations for all resources. Single-agent deployments remain unaffected and retain access to all available hardware resources.
1 parent b7b9429 commit 6d2e3d2

File tree

4 files changed

+694
-37
lines changed

4 files changed

+694
-37
lines changed

changes/6724.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add custom resource allocation in agent server config

configs/agent/sample.toml

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,14 @@
143143
# If agents field is populated, this field indicates the default values for all
144144
# agents.
145145
[resource]
146+
# Hard CPU allocation for this agent (e.g., 8 cores).
147+
# Only used in MANUAL allocation mode.
148+
# All agents must specify this value when allocation-mode is MANUAL.
149+
## allocated-cpu = 8
150+
# Hard memory allocation for this agent (e.g., "32G").
151+
# Only used in MANUAL allocation mode.
152+
# All agents must specify this value when allocation-mode is MANUAL.
153+
## allocated-mem = "32G"
146154
# The number of CPU cores reserved for the operating system and the agent
147155
# service.
148156
reserved-cpu = 1
@@ -156,6 +164,12 @@
156164
# Currently this value is unused. In future releases, it may be used to preserve
157165
# the minimum disk space from the scratch disk allocation via loopback files.
158166
reserved-disk = "8G"
167+
# Resource allocation mode for multi-agent scenarios.
168+
# - `shared`: All agents share the full resource pool (default, backward
169+
# compatible).
170+
# - `auto-split`: Automatically divide resources equally (1/N) among all agents.
171+
# - `manual`: Manually specify per-agent resource allocations via config.
172+
allocation-mode = "shared"
159173
# The alignment of the reported main memory size to absorb tiny deviations from
160174
# per-node firmware/hardware settings. Recommended to be multiple of the
161175
# page/hugepage size (e.g., 2 MiB).
@@ -165,6 +179,10 @@
165179
# Affinity policy
166180
affinity-policy = "INTERLEAVED"
167181

182+
# Device-specific per-slot resource allocations.
183+
# Only used in MANUAL allocation mode.
184+
[resource.allocated-devices]
185+
168186
# Pyroscope configuration
169187
[pyroscope]
170188
# Whether to enable Pyroscope profiling
@@ -409,24 +427,15 @@
409427

410428
# Resource config overrides for the individual agent
411429
[agents.resource]
412-
# The number of CPU cores reserved for the operating system and the agent
413-
# service.
414-
reserved-cpu = 1
415-
# The memory space reserved for the operating system and the agent service. It
416-
# is subtracted from the reported main memory size and not available for user
417-
# workload allocation. Depending on the memory-align-size option and system
418-
# configuration, this may not be the exact value but have slightly less or more
419-
# values within the memory-align-size.
420-
reserved-mem = 1073741824
421-
# The disk space reserved for the operating system and the agent service.
422-
# Currently this value is unused. In future releases, it may be used to preserve
423-
# the minimum disk space from the scratch disk allocation via loopback files.
424-
reserved-disk = 8589934592
425-
# The alignment of the reported main memory size to absorb tiny deviations from
426-
# per-node firmware/hardware settings. Recommended to be multiple of the
427-
# page/hugepage size (e.g., 2 MiB).
428-
memory-align-size = 16777216
429-
# Resource allocation order
430-
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
431-
# Affinity policy
432-
affinity-policy = 1
430+
# Hard CPU allocation for this agent (e.g., 8 cores).
431+
# Only used in MANUAL allocation mode.
432+
# All agents must specify this value when allocation-mode is MANUAL.
433+
## allocated-cpu = 8
434+
# Hard memory allocation for this agent (e.g., "32G").
435+
# Only used in MANUAL allocation mode.
436+
# All agents must specify this value when allocation-mode is MANUAL.
437+
## allocated-mem = "32G"
438+
439+
# Device-specific per-slot resource allocations.
440+
# Only used in MANUAL allocation mode.
441+
[agents.resource.allocated-devices]

src/ai/backend/agent/config/unified.py

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import os
1212
import sys
1313
import textwrap
14+
from decimal import Decimal
1415
from pathlib import Path
1516
from typing import (
1617
Any,
@@ -50,6 +51,7 @@
5051
BinarySizeField,
5152
ResourceGroupType,
5253
ServiceDiscoveryType,
54+
SlotName,
5355
)
5456
from ai.backend.logging import BraceStyleAdapter
5557
from ai.backend.logging.config import LoggingConfig
@@ -80,6 +82,12 @@ class ScratchType(enum.StrEnum):
8082
K8S_NFS = "k8s-nfs"
8183

8284

85+
class ResourceAllocationMode(enum.StrEnum):
86+
SHARED = "shared"
87+
AUTO_SPLIT = "auto-split"
88+
MANUAL = "manual"
89+
90+
8391
class AgentConfigValidationContext(BaseConfigValidationContext):
8492
is_invoked_subcommand: bool
8593

@@ -862,7 +870,7 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig):
862870
pass
863871

864872

865-
class ResourceConfig(BaseConfigSchema):
873+
class CommonResourceConfig(BaseConfigSchema):
866874
reserved_cpu: int = Field(
867875
default=1,
868876
description="The number of CPU cores reserved for the operating system and the agent service.",
@@ -894,6 +902,18 @@ class ResourceConfig(BaseConfigSchema):
894902
validation_alias=AliasChoices("reserved-disk", "reserved_disk"),
895903
serialization_alias="reserved-disk",
896904
)
905+
allocation_mode: ResourceAllocationMode = Field(
906+
default=ResourceAllocationMode.SHARED,
907+
description=textwrap.dedent("""
908+
Resource allocation mode for multi-agent scenarios.
909+
- `shared`: All agents share the full resource pool (default, backward compatible).
910+
- `auto-split`: Automatically divide resources equally (1/N) among all agents.
911+
- `manual`: Manually specify per-agent resource allocations via config.
912+
"""),
913+
examples=[item.value for item in ResourceAllocationMode],
914+
validation_alias=AliasChoices("allocation-mode", "allocation_mode"),
915+
serialization_alias="allocation-mode",
916+
)
897917
memory_align_size: BinarySizeField = Field(
898918
default=BinarySize.finite_from_str("16M"),
899919
description=(
@@ -936,6 +956,64 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy:
936956
return v
937957

938958

959+
class OverridableResourceConfig(BaseConfigSchema):
    # Manual resource allocation fields.  Every field is optional at the
    # schema level; this class only rejects negative numbers (see the
    # validator at the bottom).
    #
    # NOTE: documented with comments rather than a class docstring so that the
    # model's generated schema stays exactly as before.

    # Unknown keys are accepted rather than rejected.
    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    # Number of CPU cores assigned to the agent; ``None`` means "not set".
    allocated_cpu: Optional[int] = Field(
        default=None,
        description=textwrap.dedent("""
            Hard CPU allocation for this agent (e.g., 8 cores).
            Only used in MANUAL allocation mode.
            All agents must specify this value when allocation-mode is MANUAL.
            """),
        examples=[8, 16],
        validation_alias=AliasChoices("allocated-cpu", "allocated_cpu"),
        serialization_alias="allocated-cpu",
    )
    # Amount of main memory assigned to the agent; ``None`` means "not set".
    allocated_mem: Optional[BinarySizeField] = Field(
        default=None,
        description=textwrap.dedent("""
            Hard memory allocation for this agent (e.g., "32G").
            Only used in MANUAL allocation mode.
            All agents must specify this value when allocation-mode is MANUAL.
            """),
        examples=["32G", "64G"],
        validation_alias=AliasChoices("allocated-mem", "allocated_mem"),
        serialization_alias="allocated-mem",
    )
    # Per-slot device allocations keyed by slot name; empty by default.
    allocated_devices: Mapping[SlotName, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
            Device-specific per-slot resource allocations.
            Only used in MANUAL allocation mode.
            """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
        validation_alias=AliasChoices("allocated-devices", "allocated_devices"),
        serialization_alias="allocated-devices",
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """Reject negative allocation values.

        Despite the name, zero is accepted: only values strictly below zero
        are refused.  Checks run in the order cpu, mem, devices, and the first
        violation raises.

        Raises:
            ValueError: if ``allocated_cpu``, ``allocated_mem`` or any value in
                ``allocated_devices`` is negative.
        """
        cpu = self.allocated_cpu
        if cpu is not None and cpu < 0:
            raise ValueError(f"Allocated cpu must not be a negative value, but given {cpu}")

        mem = self.allocated_mem
        if mem is not None and mem < 0:
            raise ValueError(f"Allocated mem must not be a negative value, but given {mem}")

        for amount in self.allocated_devices.values():
            if amount < 0:
                raise ValueError(
                    "All allocated device resource values must not be a negative value"
                )
        return self
1011+
1012+
1013+
# Combines the settings declared on CommonResourceConfig with the manual
# allocation fields declared on OverridableResourceConfig; adds nothing itself.
class ResourceConfig(CommonResourceConfig, OverridableResourceConfig):
    ...
1015+
1016+
9391017
class EtcdConfig(BaseConfigSchema):
9401018
namespace: str = Field(
9411019
description="Etcd namespace",
@@ -1165,7 +1243,7 @@ class AgentOverrideConfig(BaseConfigSchema):
11651243
default=None,
11661244
description="Container config overrides for the individual agent",
11671245
)
1168-
resource: ResourceConfig | None = Field(
1246+
resource: OverridableResourceConfig | None = Field(
11691247
default=None,
11701248
description="Resource config overrides for the individual agent",
11711249
)
@@ -1228,6 +1306,10 @@ def agent_configs(self) -> Sequence[AgentUnifiedConfig]:
12281306
def agent_ids(self) -> Sequence[AgentId]:
12291307
return [AgentId(agent_config.agent.id) for agent_config in self.agent_configs]
12301308

1309+
@property
def resource_common(self) -> CommonResourceConfig:
    """Return ``self.resource`` typed as its ``CommonResourceConfig`` base."""
    common_view = self.resource
    return common_view
1312+
12311313
def with_updates(
12321314
self,
12331315
*,
@@ -1308,3 +1390,34 @@ def _validate_docker_config(self) -> Self:
13081390
DockerExtraConfig.model_validate(config.container.model_dump())
13091391

13101392
return self
1393+
1394+
@model_validator(mode="after")
def _validate_resource_allocation_mode(self) -> Self:
    """Check that per-agent allocations agree with the allocation mode.

    * ``SHARED`` / ``AUTO_SPLIT``: no agent may set ``allocated_cpu``,
      ``allocated_mem`` or a non-empty ``allocated_devices``.
    * ``MANUAL``: every agent must set both ``allocated_cpu`` and
      ``allocated_mem``, and all agents must declare the same set of device
      slot names.

    Raises:
        ValueError: on the first rule violated.
    """
    mode = self.resource.allocation_mode

    if mode in (ResourceAllocationMode.SHARED, ResourceAllocationMode.AUTO_SPLIT):
        for agent_config in self.agent_configs:
            res = agent_config.resource
            has_manual_values = (
                res.allocated_cpu is not None
                or res.allocated_mem is not None
                or bool(res.allocated_devices)
            )
            if has_manual_values:
                raise ValueError(
                    "On non-MANUAL mode, config must not specify manual resource allocations"
                )
        return self

    if mode == ResourceAllocationMode.MANUAL:
        for agent_config in self.agent_configs:
            res = agent_config.resource
            if res.allocated_cpu is None or res.allocated_mem is None:
                raise ValueError(
                    "On MANUAL mode, config must specify cpu and mem resource allocations"
                )

        # More than one distinct key set means at least two agents disagree.
        distinct_slot_sets = {
            frozenset(agent_config.resource.allocated_devices)
            for agent_config in self.agent_configs
        }
        if len(distinct_slot_sets) > 1:
            raise ValueError("All agents must have the same slots defined in the devices!")

    return self

0 commit comments

Comments
 (0)