|
11 | 11 | import os |
12 | 12 | import sys |
13 | 13 | import textwrap |
| 14 | +from decimal import Decimal |
14 | 15 | from pathlib import Path |
15 | 16 | from typing import ( |
16 | 17 | Any, |
|
49 | 50 | BinarySizeField, |
50 | 51 | ResourceGroupType, |
51 | 52 | ServiceDiscoveryType, |
| 53 | + SlotName, |
52 | 54 | ) |
53 | 55 | from ai.backend.logging import BraceStyleAdapter |
54 | 56 | from ai.backend.logging.config import LoggingConfig |
@@ -77,6 +79,12 @@ class ScratchType(enum.StrEnum): |
77 | 79 | K8S_NFS = "k8s-nfs" |
78 | 80 |
|
79 | 81 |
|
| 82 | +class ResourceAllocationMode(enum.StrEnum): |
| 83 | + SHARED = "shared" |
| 84 | + AUTO_SPLIT = "auto-split" |
| 85 | + MANUAL = "manual" |
| 86 | + |
| 87 | + |
80 | 88 | class AgentConfigValidationContext(BaseConfigValidationContext): |
81 | 89 | is_invoked_subcommand: bool |
82 | 90 |
|
@@ -867,7 +875,7 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig): |
867 | 875 | pass |
868 | 876 |
|
869 | 877 |
|
870 | | -class ResourceConfig(BaseConfigSchema): |
| 878 | +class CommonResourceConfig(BaseConfigSchema): |
871 | 879 | reserved_cpu: int = Field( |
872 | 880 | default=1, |
873 | 881 | description="The number of CPU cores reserved for the operating system and the agent service.", |
@@ -899,6 +907,18 @@ class ResourceConfig(BaseConfigSchema): |
899 | 907 | validation_alias=AliasChoices("reserved-disk", "reserved_disk"), |
900 | 908 | serialization_alias="reserved-disk", |
901 | 909 | ) |
| 910 | + allocation_mode: ResourceAllocationMode = Field( |
| 911 | + default=ResourceAllocationMode.SHARED, |
| 912 | + description=textwrap.dedent(""" |
| 913 | + Resource allocation mode for multi-agent scenarios. |
| 914 | + - `shared`: All agents share the full resource pool (default, backward compatible). |
| 915 | + - `auto-split`: Automatically divide resources equally (1/N) among all agents. |
| 916 | + - `manual`: Manually specify per-agent resource allocations via config. |
| 917 | + """), |
| 918 | + examples=[item.value for item in ResourceAllocationMode], |
| 919 | + validation_alias=AliasChoices("allocation-mode", "allocation_mode"), |
| 920 | + serialization_alias="allocation-mode", |
| 921 | + ) |
902 | 922 | memory_align_size: BinarySizeField = Field( |
903 | 923 | default=BinarySize.finite_from_str("16M"), |
904 | 924 | description=( |
@@ -941,6 +961,64 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy: |
941 | 961 | return v |
942 | 962 |
|
943 | 963 |
|
class OverridableResourceConfig(BaseConfigSchema):
    """Resource settings that can be overridden for an individual agent.

    The ``allocated_*`` fields are described as meaningful only in the
    ``manual`` allocation mode.  This class itself only verifies that any
    supplied value is non-negative.
    """

    allocated_cpu: Optional[int] = Field(
        default=None,
        description=textwrap.dedent("""
        Hard CPU allocation for this agent (e.g., 8 cores).
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=[8, 16],
        validation_alias=AliasChoices("allocated-cpu", "allocated_cpu"),
        serialization_alias="allocated-cpu",
    )
    allocated_mem: Optional[BinarySizeField] = Field(
        default=None,
        description=textwrap.dedent("""
        Hard memory allocation for this agent (e.g., "32G").
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=["32G", "64G"],
        validation_alias=AliasChoices("allocated-mem", "allocated_mem"),
        serialization_alias="allocated-mem",
    )
    allocated_devices: Mapping[SlotName, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
        Device-specific per-slot resource allocations.
        Only used in MANUAL allocation mode.
        """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
        validation_alias=AliasChoices("allocated-devices", "allocated_devices"),
        serialization_alias="allocated-devices",
    )

    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """Reject negative allocations.

        Unset (``None``) values and zero are accepted; only strictly negative
        values are refused.

        Raises:
            ValueError: if the CPU, memory, or any device allocation is negative.
        """
        if self.allocated_cpu is not None and self.allocated_cpu < 0:
            raise ValueError(
                f"Allocated cpu must not be a negative value, but given {self.allocated_cpu}"
            )
        if self.allocated_mem is not None and self.allocated_mem < 0:
            raise ValueError(
                f"Allocated mem must not be a negative value, but given {self.allocated_mem}"
            )
        if any(value < 0 for value in self.allocated_devices.values()):
            raise ValueError("All allocated device resource values must not be a negative value")
        return self
| 1016 | + |
| 1017 | + |
class ResourceConfig(CommonResourceConfig, OverridableResourceConfig):
    """Full resource configuration: the common fields plus the per-agent overridable ones."""
| 1020 | + |
| 1021 | + |
944 | 1022 | class EtcdConfig(BaseConfigSchema): |
945 | 1023 | namespace: str = Field( |
946 | 1024 | description="Etcd namespace", |
@@ -1187,7 +1265,7 @@ class AgentOverrideConfig(BaseConfigSchema): |
1187 | 1265 | default=None, |
1188 | 1266 | description="Container config overrides for the individual agent", |
1189 | 1267 | ) |
1190 | | - resource: ResourceConfig | None = Field( |
| 1268 | + resource: OverridableResourceConfig | None = Field( |
1191 | 1269 | default=None, |
1192 | 1270 | description="Resource config overrides for the individual agent", |
1193 | 1271 | ) |
@@ -1315,3 +1393,34 @@ def _validate_agent_configs(self) -> Self: |
1315 | 1393 | config.validate_agent_specific_config() |
1316 | 1394 |
|
1317 | 1395 | return self |
| 1396 | + |
| 1397 | + @model_validator(mode="after") |
| 1398 | + def _validate_resource_allocation_mode(self) -> Self: |
| 1399 | + match self.resource.allocation_mode: |
| 1400 | + case ResourceAllocationMode.SHARED | ResourceAllocationMode.AUTO_SPLIT: |
| 1401 | + for config in self.agent_configs: |
| 1402 | + resource = config.resource |
| 1403 | + if any([ |
| 1404 | + resource.allocated_cpu is not None, |
| 1405 | + resource.allocated_mem is not None, |
| 1406 | + resource.allocated_devices, |
| 1407 | + ]): |
| 1408 | + raise ValueError( |
| 1409 | + "On non-MANUAL mode, config must not specify manual resource allocations" |
| 1410 | + ) |
| 1411 | + |
| 1412 | + case ResourceAllocationMode.MANUAL: |
| 1413 | + for config in self.agent_configs: |
| 1414 | + resource = config.resource |
| 1415 | + if any([resource.allocated_cpu is None, resource.allocated_mem is None]): |
| 1416 | + raise ValueError( |
| 1417 | + "On MANUAL mode, config must specify cpu and mem resource allocations" |
| 1418 | + ) |
| 1419 | + |
| 1420 | + slot_names = [ |
| 1421 | + set(config.resource.allocated_devices.keys()) for config in self.agent_configs |
| 1422 | + ] |
| 1423 | + if not all(slot_name == slot_names[0] for slot_name in slot_names): |
| 1424 | + raise ValueError("All agents must have the same slots defined in the devices!") |
| 1425 | + |
| 1426 | + return self |
0 commit comments