|
11 | 11 | import os |
12 | 12 | import sys |
13 | 13 | import textwrap |
| 14 | +from decimal import Decimal |
14 | 15 | from pathlib import Path |
15 | 16 | from typing import ( |
16 | 17 | Any, |
|
50 | 51 | BinarySizeField, |
51 | 52 | ResourceGroupType, |
52 | 53 | ServiceDiscoveryType, |
| 54 | + SlotName, |
53 | 55 | ) |
54 | 56 | from ai.backend.logging import BraceStyleAdapter |
55 | 57 | from ai.backend.logging.config import LoggingConfig |
@@ -80,6 +82,12 @@ class ScratchType(enum.StrEnum): |
80 | 82 | K8S_NFS = "k8s-nfs" |
81 | 83 |
|
82 | 84 |
|
| 85 | +class ResourceAllocationMode(enum.StrEnum): |
| 86 | + SHARED = "shared" |
| 87 | + AUTO_SPLIT = "auto-split" |
| 88 | + MANUAL = "manual" |
| 89 | + |
| 90 | + |
83 | 91 | class AgentConfigValidationContext(BaseConfigValidationContext): |
84 | 92 | is_invoked_subcommand: bool |
85 | 93 |
|
@@ -862,7 +870,7 @@ class ContainerConfig(CommonContainerConfig, OverridableContainerConfig): |
862 | 870 | pass |
863 | 871 |
|
864 | 872 |
|
865 | | -class ResourceConfig(BaseConfigSchema): |
| 873 | +class CommonResourceConfig(BaseConfigSchema): |
866 | 874 | reserved_cpu: int = Field( |
867 | 875 | default=1, |
868 | 876 | description="The number of CPU cores reserved for the operating system and the agent service.", |
@@ -894,6 +902,18 @@ class ResourceConfig(BaseConfigSchema): |
894 | 902 | validation_alias=AliasChoices("reserved-disk", "reserved_disk"), |
895 | 903 | serialization_alias="reserved-disk", |
896 | 904 | ) |
| 905 | + allocation_mode: ResourceAllocationMode = Field( |
| 906 | + default=ResourceAllocationMode.SHARED, |
| 907 | + description=textwrap.dedent(""" |
| 908 | + Resource allocation mode for multi-agent scenarios. |
| 909 | + - `shared`: All agents share the full resource pool (default, backward compatible). |
| 910 | + - `auto-split`: Automatically divide resources equally (1/N) among all agents. |
| 911 | + - `manual`: Manually specify per-agent resource allocations via config. |
| 912 | + """), |
| 913 | + examples=[item.value for item in ResourceAllocationMode], |
| 914 | + validation_alias=AliasChoices("allocation-mode", "allocation_mode"), |
| 915 | + serialization_alias="allocation-mode", |
| 916 | + ) |
897 | 917 | memory_align_size: BinarySizeField = Field( |
898 | 918 | default=BinarySize.finite_from_str("16M"), |
899 | 919 | description=( |
@@ -936,6 +956,64 @@ def _parse_affinity_policy(cls, v: Any) -> AffinityPolicy: |
936 | 956 | return v |
937 | 957 |
|
938 | 958 |
|
# Resource settings that may be specified per agent.
#
# The three `allocated_*` fields describe hard allocations and, per their
# descriptions, are only meaningful in MANUAL allocation mode.  A plain comment
# is used instead of a class docstring so that the generated model schema is
# left untouched.
class OverridableResourceConfig(BaseConfigSchema):
    # Unknown keys are accepted (`extra="allow"`) and arbitrary field types are
    # permitted.
    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
    )

    # Number of CPU cores given to this agent; `None` means "not specified".
    allocated_cpu: Optional[int] = Field(
        default=None,
        description=textwrap.dedent("""
        Hard CPU allocation for this agent (e.g., 8 cores).
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=[8, 16],
        validation_alias=AliasChoices("allocated-cpu", "allocated_cpu"),
        serialization_alias="allocated-cpu",
    )
    # Amount of memory given to this agent; `None` means "not specified".
    allocated_mem: Optional[BinarySizeField] = Field(
        default=None,
        description=textwrap.dedent("""
        Hard memory allocation for this agent (e.g., "32G").
        Only used in MANUAL allocation mode.
        All agents must specify this value when allocation-mode is MANUAL.
        """),
        examples=["32G", "64G"],
        validation_alias=AliasChoices("allocated-mem", "allocated_mem"),
        serialization_alias="allocated-mem",
    )
    # Per-slot device allocations keyed by slot name; empty when unspecified.
    allocated_devices: Mapping[SlotName, Decimal] = Field(
        default_factory=dict,
        description=textwrap.dedent("""
        Device-specific per-slot resource allocations.
        Only used in MANUAL allocation mode.
        """),
        examples=[{"cuda.mem": "0.3", "cuda.shares": "0.5"}],
        validation_alias=AliasChoices("allocated-devices", "allocated_devices"),
        serialization_alias="allocated-devices",
    )

    @model_validator(mode="after")
    def validate_values_are_positive(self) -> Self:
        """
        Reject negative allocations.

        Zero is accepted; only values strictly below zero are refused.
        Checks run in the order cpu, mem, devices and stop at the first failure.

        Raises:
            ValueError: if the cpu, the mem, or any device allocation is negative.
        """
        cpu = self.allocated_cpu
        if cpu is not None and cpu < 0:
            raise ValueError(f"Allocated cpu must not be a negative value, but given {cpu}")

        mem = self.allocated_mem
        if mem is not None and mem < 0:
            raise ValueError(f"Allocated mem must not be a negative value, but given {mem}")

        for amount in self.allocated_devices.values():
            if amount < 0:
                raise ValueError(
                    "All allocated device resource values must not be a negative value"
                )
        return self
| 1011 | + |
| 1012 | + |
# Complete resource configuration: the settings from CommonResourceConfig
# combined with the allocation fields from OverridableResourceConfig.
class ResourceConfig(CommonResourceConfig, OverridableResourceConfig):
    # Nothing is added here; every field and validator comes from the bases.
    ...
| 1015 | + |
| 1016 | + |
939 | 1017 | class EtcdConfig(BaseConfigSchema): |
940 | 1018 | namespace: str = Field( |
941 | 1019 | description="Etcd namespace", |
@@ -1165,7 +1243,7 @@ class AgentOverrideConfig(BaseConfigSchema): |
1165 | 1243 | default=None, |
1166 | 1244 | description="Container config overrides for the individual agent", |
1167 | 1245 | ) |
1168 | | - resource: ResourceConfig | None = Field( |
| 1246 | + resource: OverridableResourceConfig | None = Field( |
1169 | 1247 | default=None, |
1170 | 1248 | description="Resource config overrides for the individual agent", |
1171 | 1249 | ) |
@@ -1228,6 +1306,10 @@ def agent_configs(self) -> Sequence[AgentUnifiedConfig]: |
1228 | 1306 | def agent_ids(self) -> Sequence[AgentId]: |
1229 | 1307 | return [AgentId(agent_config.agent.id) for agent_config in self.agent_configs] |
1230 | 1308 |
|
@property
def resource_common(self) -> CommonResourceConfig:
    """Return this config's ``resource`` section, typed as ``CommonResourceConfig``."""
    common_view: CommonResourceConfig = self.resource
    return common_view
| 1312 | + |
1231 | 1313 | def with_updates( |
1232 | 1314 | self, |
1233 | 1315 | *, |
@@ -1308,3 +1390,34 @@ def _validate_docker_config(self) -> Self: |
1308 | 1390 | DockerExtraConfig.model_validate(config.container.model_dump()) |
1309 | 1391 |
|
1310 | 1392 | return self |
| 1393 | + |
@model_validator(mode="after")
def _validate_resource_allocation_mode(self) -> Self:
    """
    Check that per-agent allocations agree with ``resource.allocation_mode``.

    * MANUAL: every agent config must set both ``allocated_cpu`` and
      ``allocated_mem``, and all agents must declare the same set of slot
      names in ``allocated_devices``.
    * SHARED / AUTO_SPLIT: no agent config may set ``allocated_cpu``,
      ``allocated_mem`` or a non-empty ``allocated_devices``.

    Raises:
        ValueError: on the first agent config violating the rules above.
    """
    mode = self.resource.allocation_mode

    if mode == ResourceAllocationMode.MANUAL:
        for agent_config in self.agent_configs:
            agent_resource = agent_config.resource
            cpu_missing = agent_resource.allocated_cpu is None
            mem_missing = agent_resource.allocated_mem is None
            if cpu_missing or mem_missing:
                raise ValueError(
                    "On MANUAL mode, config must specify cpu and mem resource allocations"
                )

        # Every agent is compared against the first agent's slot-name set.
        slot_sets = [
            set(agent_config.resource.allocated_devices.keys())
            for agent_config in self.agent_configs
        ]
        if any(slots != slot_sets[0] for slots in slot_sets):
            raise ValueError("All agents must have the same slots defined in the devices!")

    elif mode in (ResourceAllocationMode.SHARED, ResourceAllocationMode.AUTO_SPLIT):
        for agent_config in self.agent_configs:
            agent_resource = agent_config.resource
            cpu_given = agent_resource.allocated_cpu is not None
            mem_given = agent_resource.allocated_mem is not None
            devices_given = bool(agent_resource.allocated_devices)
            if cpu_given or mem_given or devices_given:
                raise ValueError(
                    "On non-MANUAL mode, config must not specify manual resource allocations"
                )

    return self
0 commit comments