Skip to content

Commit 103dbcc

Browse files
committed
feat(BA-3024): Add custom resource alloc for agents in config
This change adds configuration for partitioning resources among agents, instead of every agent always seeing the full resource pool. This prevents unintended over-allocation that could crash kernels. SHARED mode lets all agents see the full resource pool (useful for stress testing); this is the same behavior as before. AUTO_SPLIT mode automatically divides resources equally among agents. MANUAL mode lets users specify exact per-agent allocations for all resources. Single-agent deployments remain unaffected and retain access to all available hardware resources.
1 parent e808760 commit 103dbcc

File tree

4 files changed

+700
-37
lines changed

4 files changed

+700
-37
lines changed

changes/6724.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add custom resource allocation in agent server config

configs/agent/sample.toml

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,14 @@
143143
# If agents field is populated, this field indicates the default values for all
144144
# agents.
145145
[resource]
146+
# Hard CPU allocation for this agent (e.g., 8 cores).
147+
# Only used in MANUAL allocation mode.
148+
# All agents must specify this value when allocation-mode is MANUAL.
149+
## allocated-cpu = 8
150+
# Hard memory allocation for this agent (e.g., "32G").
151+
# Only used in MANUAL allocation mode.
152+
# All agents must specify this value when allocation-mode is MANUAL.
153+
## allocated-mem = "32G"
146154
# The number of CPU cores reserved for the operating system and the agent
147155
# service.
148156
reserved-cpu = 1
@@ -156,6 +164,12 @@
156164
# Currently this value is unused. In future releases, it may be used to preserve
157165
# the minimum disk space from the scratch disk allocation via loopback files.
158166
reserved-disk = "8G"
167+
# Resource allocation mode for multi-agent scenarios.
168+
# - `shared`: All agents share the full resource pool (default, backward
169+
# compatible).
170+
# - `auto-split`: Automatically divide resources equally (1/N) among all agents.
171+
# - `manual`: Manually specify per-agent resource allocations via config.
172+
allocation-mode = "shared"
159173
# The alignment of the reported main memory size to absorb tiny deviations from
160174
# per-node firmware/hardware settings. Recommended to be multiple of the
161175
# page/hugepage size (e.g., 2 MiB).
@@ -165,6 +179,10 @@
165179
# Affinity policy
166180
affinity-policy = "INTERLEAVED"
167181

182+
# Device-specific per-slot resource allocations.
183+
# Only used in MANUAL allocation mode.
184+
[resource.allocated-devices]
185+
168186
# Pyroscope configuration
169187
[pyroscope]
170188
# Whether to enable Pyroscope profiling
@@ -409,24 +427,15 @@
409427

410428
# Resource config overrides for the individual agent
411429
[agents.resource]
412-
# The number of CPU cores reserved for the operating system and the agent
413-
# service.
414-
reserved-cpu = 1
415-
# The memory space reserved for the operating system and the agent service. It
416-
# is subtracted from the reported main memory size and not available for user
417-
# workload allocation. Depending on the memory-align-size option and system
418-
# configuration, this may not be the exact value but have slightly less or more
419-
# values within the memory-align-size.
420-
reserved-mem = 1073741824
421-
# The disk space reserved for the operating system and the agent service.
422-
# Currently this value is unused. In future releases, it may be used to preserve
423-
# the minimum disk space from the scratch disk allocation via loopback files.
424-
reserved-disk = 8589934592
425-
# The alignment of the reported main memory size to absorb tiny deviations from
426-
# per-node firmware/hardware settings. Recommended to be multiple of the
427-
# page/hugepage size (e.g., 2 MiB).
428-
memory-align-size = 16777216
429-
# Resource allocation order
430-
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
431-
# Affinity policy
432-
affinity-policy = 1
430+
# Hard CPU allocation for this agent (e.g., 8 cores).
431+
# Only used in MANUAL allocation mode.
432+
# All agents must specify this value when allocation-mode is MANUAL.
433+
## allocated-cpu = 8
434+
# Hard memory allocation for this agent (e.g., "32G").
435+
# Only used in MANUAL allocation mode.
436+
# All agents must specify this value when allocation-mode is MANUAL.
437+
## allocated-mem = "32G"
438+
439+
# Device-specific per-slot resource allocations.
440+
# Only used in MANUAL allocation mode.
441+
[agents.resource.allocated-devices]

src/ai/backend/agent/config/unified.py

Lines changed: 121 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import os
1212
import sys
1313
import textwrap
14+
from decimal import Decimal
1415
from pathlib import Path
1516
from typing import (
1617
Any,
@@ -51,6 +52,7 @@
5152
BinarySizeField,
5253
ResourceGroupType,
5354
ServiceDiscoveryType,
55+
SlotName,
5456
)
5557
from ai.backend.logging import BraceStyleAdapter
5658
from ai.backend.logging.config import LoggingConfig
@@ -81,6 +83,12 @@ class ScratchType(enum.StrEnum):
8183
K8S_NFS = "k8s-nfs"
8284

8385

86+
@enum.unique
class ResourceAllocationMode(enum.StrEnum):
    """How the host's resources are presented to the agents of one agent server.

    The member values are the strings accepted for the `allocation-mode`
    option of the `[resource]` config section.
    """

    # Every agent sees the full resource pool.
    SHARED = "shared"
    # Resources are divided equally (1/N) among all agents.
    AUTO_SPLIT = "auto-split"
    # Per-agent allocations are given explicitly in the config.
    MANUAL = "manual"
90+
91+
8492
class AgentConfigValidationContext(BaseConfigValidationContext):
8593
is_invoked_subcommand: bool
8694

@@ -863,7 +871,7 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig):
863871
pass
864872

865873

866-
class ResourceConfig(BaseConfigSchema):
874+
class CommonResourceConfig(BaseConfigSchema):
867875
reserved_cpu: int = Field(
868876
default=1,
869877
description="The number of CPU cores reserved for the operating system and the agent service.",
@@ -895,6 +903,18 @@ class ResourceConfig(BaseConfigSchema):
895903
validation_alias=AliasChoices("reserved-disk", "reserved_disk"),
896904
serialization_alias="reserved-disk",
897905
)
906+
allocation_mode: ResourceAllocationMode = Field(
907+
default=ResourceAllocationMode.SHARED,
908+
description=textwrap.dedent("""
909+
Resource allocation mode for multi-agent scenarios.
910+
- `shared`: All agents share the full resource pool (default, backward compatible).
911+
- `auto-split`: Automatically divide resources equally (1/N) among all agents.
912+
- `manual`: Manually specify per-agent resource allocations via config.
913+
"""),
914+
examples=[item.value for item in ResourceAllocationMode],
915+
validation_alias=AliasChoices("allocation-mode", "allocation_mode"),
916+
serialization_alias="allocation-mode",
917+
)
898918
memory_align_size: BinarySizeField = Field(
899919
default=BinarySize.finite_from_str("16M"),
900920
description=(
@@ -937,6 +957,64 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy:
937957
return v
938958

939959

960+
class OverridableResourceConfig(BaseConfigSchema):
    """Resource options that may be set per agent (hard allocations).

    All three fields are optional at the schema level; the after-validator
    only rejects negative numbers. Whether the fields must or must not be set
    depends on the allocation mode and is not checked in this class.
    """

    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    # Number of CPU cores given to this agent; None means "not specified".
    allocated_cpu: Optional[int] = Field(
        default=None,
        description=textwrap.dedent("""
            Hard CPU allocation for this agent (e.g., 8 cores).
            Only used in MANUAL allocation mode.
            All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=[8, 16],
        validation_alias=AliasChoices("allocated-cpu", "allocated_cpu"),
        serialization_alias="allocated-cpu",
    )
    # Amount of main memory given to this agent; None means "not specified".
    allocated_mem: Optional[BinarySizeField] = Field(
        default=None,
        description=textwrap.dedent("""
            Hard memory allocation for this agent (e.g., "32G").
            Only used in MANUAL allocation mode.
            All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=["32G", "64G"],
        validation_alias=AliasChoices("allocated-mem", "allocated_mem"),
        serialization_alias="allocated-mem",
    )
    # Per-slot device amounts keyed by slot name; empty when nothing is given.
    allocated_devices: Mapping[SlotName, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
            Device-specific per-slot resource allocations.
            Only used in MANUAL allocation mode.
        """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
        validation_alias=AliasChoices("allocated-devices", "allocated_devices"),
        serialization_alias="allocated-devices",
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """Reject negative allocations.

        Despite the name, zero is accepted: only values strictly below zero
        raise.

        Returns:
            The validated model itself.

        Raises:
            ValueError: If the CPU, memory, or any device allocation is negative.
        """
        cpu = self.allocated_cpu
        if cpu is not None and cpu < 0:
            raise ValueError(f"Allocated cpu must not be a negative value, but given {cpu}")

        mem = self.allocated_mem
        if mem is not None and mem < 0:
            raise ValueError(f"Allocated mem must not be a negative value, but given {mem}")

        for amount in self.allocated_devices.values():
            if amount < 0:
                raise ValueError(
                    "All allocated device resource values must not be a negative value"
                )
        return self
1012+
1013+
1014+
class ResourceConfig(CommonResourceConfig, OverridableResourceConfig):
    """Full `[resource]` section: the common options plus the per-agent overridable ones.

    Adds no fields of its own; everything is inherited from the two bases.
    """
1016+
1017+
9401018
class EtcdConfig(BaseConfigSchema):
9411019
namespace: str = Field(
9421020
description="Etcd namespace",
@@ -1166,7 +1244,7 @@ class AgentOverrideConfig(BaseConfigSchema):
11661244
default=None,
11671245
description="Container config overrides for the individual agent",
11681246
)
1169-
resource: ResourceConfig | None = Field(
1247+
resource: OverridableResourceConfig | None = Field(
11701248
default=None,
11711249
description="Resource config overrides for the individual agent",
11721250
)
@@ -1229,6 +1307,10 @@ def agent_configs(self) -> Sequence[AgentUnifiedConfig]:
12291307
def agent_ids(self) -> Sequence[AgentId]:
12301308
return [AgentId(agent_config.agent.id) for agent_config in self.agent_configs]
12311309

1310+
@property
def resource_common(self) -> CommonResourceConfig:
    """Return `self.resource`, typed as the common part of the resource config."""
    return self.resource
1313+
12321314
def with_updates(
12331315
self,
12341316
*,
@@ -1312,6 +1394,43 @@ def validate(config: AgentSpecificConfig) -> None:
13121394
self._for_each_agent(validate)
13131395
return self
13141396

1397+
@model_validator(mode="after")
def _validate_resource_allocation_mode(self) -> Self:
    """Check that the manual allocation fields agree with `resource.allocation_mode`.

    The checks are run through `_for_each_agent`:

    - SHARED / AUTO_SPLIT: no config may set `allocated_cpu`, `allocated_mem`,
      or a non-empty `allocated_devices`.
    - MANUAL: every config must set both `allocated_cpu` and `allocated_mem`.
    - In every mode, all configs must declare the same set of device slot names.

    Returns:
        The validated model itself.

    Raises:
        ValueError: If any of the rules above is violated.
    """

    def validate_manual_resource_not_specified(config: AgentSpecificConfig) -> None:
        res = config.resource
        has_manual_allocation = (
            res.allocated_cpu is not None
            or res.allocated_mem is not None
            or bool(res.allocated_devices)
        )
        if has_manual_allocation:
            raise ValueError(
                "On non-MANUAL mode, config must not specify manual resource allocations"
            )

    def validate_mandatory_manual_resource_specified(config: AgentSpecificConfig) -> None:
        res = config.resource
        if res.allocated_cpu is None or res.allocated_mem is None:
            raise ValueError(
                "On MANUAL mode, config must specify cpu and mem resource allocations"
            )

    mode = self.resource.allocation_mode
    if mode in (ResourceAllocationMode.SHARED, ResourceAllocationMode.AUTO_SPLIT):
        self._for_each_agent(validate_manual_resource_not_specified)
    elif mode == ResourceAllocationMode.MANUAL:
        self._for_each_agent(validate_mandatory_manual_resource_specified)

    # One set of slot names per visited config; every set must equal the first.
    slots_per_agent = self._for_each_agent(
        lambda config: set(config.resource.allocated_devices)
    )
    if any(slots != slots_per_agent[0] for slots in slots_per_agent[1:]):
        raise ValueError("All agents must have the same slots defined in the devices!")

    return self
1433+
13151434
def _for_each_agent(self, func: Callable[[AgentUnifiedConfig], R]) -> list[R]:
13161435
agents = [agent.construct_unified_config(default=self) for agent in self.agents]
13171436
if not agents:

0 commit comments

Comments
 (0)