Skip to content

Commit 90f0702

Browse files
committed
feat(BA-2753): Spawn multiple agents and route RPC appropriately
This change adds support for actually spawning multiple agents within the same agent server and adding agent_id field for all appropriate RPC calls in the agent server, then ensuring that the manager sends that info such that the agent server can correctly route the RPC calls to the correct agent.
1 parent daec211 commit 90f0702

File tree

13 files changed

+1359
-363
lines changed

13 files changed

+1359
-363
lines changed

changes/6320.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Update Agent server RPC functions to include agent ID for agent runtime with multiple agents

configs/agent/sample.toml

Lines changed: 57 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,34 @@
1515
# If agents field is populated, this field indicates the default values for all
1616
# agents.
1717
[agent]
18+
# Agent ID
19+
## id = "agent-001"
20+
# Agent socket port
21+
agent-sock-port = 6007
22+
# Base path for IPC
23+
ipc-base-path = "/tmp/backend.ai/ipc"
24+
# Base path for variable data
25+
var-base-path = "var/lib/backend.ai"
26+
# Scaling group name
27+
scaling-group = "default"
28+
# Scaling group type
29+
scaling-group-type = "compute"
30+
# Allowed compute plugins
31+
## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",]
32+
# Blocked compute plugins
33+
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
34+
# Allowed network plugins
35+
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
36+
# Blocked network plugins
37+
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
38+
# Path for image commit
39+
image-commit-path = "tmp/backend.ai/commit"
40+
# Path for abuse reports
41+
## abuse-report-path = "/var/log/backend.ai/abuse"
42+
# Whether to force terminate abusing containers
43+
force-terminate-abusing-containers = false
44+
# Kernel creation concurrency
45+
kernel-creation-concurrency = 4
1846
# Backend type for the agent.
1947
# This determines how the agent interacts with the underlying infrastructure.
2048
# Available options are:
@@ -40,14 +68,6 @@
4068
## rpc-auth-manager-public-key = "/path/to/public.key"
4169
# Path to RPC auth agent keypair
4270
## rpc-auth-agent-keypair = "/path/to/keypair.key"
43-
# Agent socket port
44-
agent-sock-port = 6007
45-
# Agent ID
46-
## id = "agent-001"
47-
# Base path for IPC
48-
ipc-base-path = "/tmp/backend.ai/ipc"
49-
# Base path for variable data
50-
var-base-path = "var/lib/backend.ai"
5171
# Mount path for containers
5272
## mount-path = "/mnt/backend.ai"
5373
# Whether to enable cohabiting storage proxy
@@ -58,10 +78,6 @@
5878
## region = "us-east-1"
5979
# Instance type
6080
## instance-type = "m5.large"
61-
# Scaling group name
62-
scaling-group = "default"
63-
# Scaling group type
64-
scaling-group-type = "compute"
6581
# Path to PID file
6682
pid-file = "/dev/null"
6783
# Event loop type
@@ -76,22 +92,6 @@
7692
metadata-server-bind-host = "0.0.0.0"
7793
# Metadata server port
7894
metadata-server-port = 40128
79-
# Allowed compute plugins
80-
## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",]
81-
# Blocked compute plugins
82-
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
83-
# Allowed network plugins
84-
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
85-
# Blocked network plugins
86-
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
87-
# Path for image commit
88-
image-commit-path = "tmp/backend.ai/commit"
89-
# Path for abuse reports
90-
## abuse-report-path = "/var/log/backend.ai/abuse"
91-
# Whether to force terminate abusing containers
92-
force-terminate-abusing-containers = false
93-
# Kernel creation concurrency
94-
kernel-creation-concurrency = 4
9595
# Whether to use experimental Redis event dispatcher
9696
use-experimental-redis-event-dispatcher = false
9797
# Docker mode detected based on kernel version (linuxkit/native)
@@ -114,10 +114,6 @@
114114
kernel-uid = -1
115115
# Kernel group ID
116116
kernel-gid = -1
117-
# Bind host for containers
118-
bind-host = ""
119-
# Advertised host for containers
120-
## advertised-host = "192.168.1.100"
121117
# Port range for containers.
122118
# If multiple agents are used, user must ensure that the port ranges
123119
# do not overlap between the agent, else it may cause subtle issues
@@ -147,6 +143,10 @@
147143
# networks, and services.
148144
# This field is only used when backend is set to 'docker'.
149145
swarm-enabled = false
146+
# Bind host for containers
147+
bind-host = ""
148+
# Advertised host for containers
149+
## advertised-host = "192.168.1.100"
150150

151151
# KRunner volumes configuration, mapping container names to host paths.
152152
# This is used to specify volumes that should be mounted into containers
@@ -368,13 +368,17 @@
368368
# Only override fields if necessary.
369369
[agents.agent]
370370
# Agent ID
371-
id = "agent-001"
371+
## id = "agent-001"
372372
# Agent socket port
373-
## agent-sock-port = 6007 # min=1024 max=65535
374-
# Mount path for containers
375-
## mount-path = "/mnt/backend.ai"
376-
# Whether to enable cohabiting storage proxy
377-
## cohabiting-storage-proxy = true
373+
agent-sock-port = 6007
374+
# Base path for IPC
375+
ipc-base-path = "/tmp/backend.ai/ipc"
376+
# Base path for variable data
377+
var-base-path = "var/lib/backend.ai"
378+
# Scaling group name
379+
scaling-group = "default"
380+
# Scaling group type
381+
scaling-group-type = "compute"
378382
# Allowed compute plugins
379383
## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",]
380384
# Blocked compute plugins
@@ -383,14 +387,14 @@
383387
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
384388
# Blocked network plugins
385389
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
390+
# Path for image commit
391+
image-commit-path = "tmp/backend.ai/commit"
392+
# Path for abuse reports
393+
## abuse-report-path = "/var/log/backend.ai/abuse"
386394
# Whether to force terminate abusing containers
387-
## force-terminate-abusing-containers = true
395+
force-terminate-abusing-containers = false
388396
# Kernel creation concurrency
389-
## kernel-creation-concurrency = 4 # min=1 max=32
390-
# Docker mode detected based on kernel version (linuxkit/native)
391-
## docker-mode = "linuxkit"
392-
# Owner uid:gid of the mount directory
393-
## mount-path-uid-gid = "root:root"
397+
kernel-creation-concurrency = 4
394398

395399
# Container lifecycle synchronization config
396400
[agents.agent.sync-container-lifecycles]
@@ -402,26 +406,26 @@
402406
# Container config overrides for the individual agent
403407
[agents.container]
404408
# Kernel user ID
405-
## kernel-uid = 1000
409+
kernel-uid = -1
406410
# Kernel group ID
407-
## kernel-gid = 1000
411+
kernel-gid = -1
408412
# Port range for containers.
409413
# If multiple agents are used, user must ensure that the port ranges
410414
# do not overlap between the agent, else it may cause subtle issues
411415
# late into the agent's runtime.
412-
## port-range = [ 30000, 31000,]
416+
port-range = [ 30000, 31000,]
413417
# Statistics type
414418
## stats-type = "cgroup"
415419
# Sandbox type
416-
## sandbox-type = "docker"
420+
sandbox-type = "docker"
417421
# Jail arguments
418-
## jail-args = [ "--mount", "/tmp",]
422+
jail-args = [ "--mount", "/tmp",]
419423
# Scratch type
420-
## scratch-type = "hostdir"
424+
scratch-type = "hostdir"
421425
# Scratch root directory
422-
## scratch-root = "./scratches"
426+
scratch-root = "scratches"
423427
# Scratch size
424-
## scratch-size = "1G"
428+
scratch-size = 0
425429
# Scratch NFS address
426430
## scratch-nfs-address = "192.168.1.100:/export"
427431
# Scratch NFS options
@@ -433,7 +437,7 @@
433437
# When enabled, the agent will use Docker Swarm APIs to manage containers,
434438
# networks, and services.
435439
# This field is only used when backend is set to 'docker'.
436-
## swarm-enabled = true
440+
swarm-enabled = false
437441

438442
# Resource config overrides for the individual agent
439443
[agents.resource]

src/ai/backend/agent/agent.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -888,6 +888,8 @@ async def __ainit__(self) -> None:
888888
"AbstractAgent.__ainit__": "Redis runtime configuration is not set."
889889
})
890890

891+
self.local_config.agent.image_commit_path.mkdir(parents=True, exist_ok=True)
892+
891893
redis_profile_target = self.local_config.redis.to_redis_profile_target()
892894
stream_redis_target = redis_profile_target.profile_target(RedisRole.STREAM)
893895
mq = await self._make_message_queue(stream_redis_target)
@@ -2276,7 +2278,7 @@ async def scan_running_kernels(self) -> None:
22762278
"""
22772279
ipc_base_path = self.local_config.agent.ipc_base_path
22782280
var_base_path = self.local_config.agent.var_base_path
2279-
last_registry_file = f"last_registry.{self.local_instance_id}.dat"
2281+
last_registry_file = f"last_registry.{self.id}.dat"
22802282
if os.path.isfile(ipc_base_path / last_registry_file):
22812283
shutil.move(ipc_base_path / last_registry_file, var_base_path / last_registry_file)
22822284
try:
@@ -3745,7 +3747,7 @@ async def save_last_registry(self, force=False) -> None:
37453747
if (not force) and (now <= self.last_registry_written_time + 60):
37463748
return # don't save too frequently
37473749
var_base_path = self.local_config.agent.var_base_path
3748-
last_registry_file = f"last_registry.{self.local_instance_id}.dat"
3750+
last_registry_file = f"last_registry.{self.id}.dat"
37493751
try:
37503752
with open(var_base_path / last_registry_file, "wb") as f:
37513753
pickle.dump(self.kernel_registry, f)

src/ai/backend/agent/config/unified.py

Lines changed: 58 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
Sequence,
2323
TypeVar,
2424
)
25+
from uuid import uuid4
2526

2627
from pydantic import (
2728
AfterValidator,
@@ -48,6 +49,7 @@
4849
UserID,
4950
)
5051
from ai.backend.common.types import (
52+
AgentId,
5153
BinarySize,
5254
BinarySizeField,
5355
ResourceGroupType,
@@ -438,6 +440,20 @@ class CommonAgentConfig(BaseConfigSchema):
438440
validation_alias=AliasChoices("rpc-auth-agent-keypair", "rpc_auth_agent_keypair"),
439441
serialization_alias="rpc-auth-agent-keypair",
440442
)
443+
ipc_base_path: AutoDirectoryPath = Field(
444+
default=AutoDirectoryPath("/tmp/backend.ai/ipc"),
445+
description="Base path for IPC",
446+
examples=["/tmp/backend.ai/ipc"],
447+
validation_alias=AliasChoices("ipc-base-path", "ipc_base_path"),
448+
serialization_alias="ipc-base-path",
449+
)
450+
var_base_path: AutoDirectoryPath = Field(
451+
default=AutoDirectoryPath("./var/lib/backend.ai"),
452+
description="Base path for variable data",
453+
examples=["./var/lib/backend.ai"],
454+
validation_alias=AliasChoices("var-base-path", "var_base_path"),
455+
serialization_alias="var-base-path",
456+
)
441457
mount_path: Optional[AutoDirectoryPath] = Field(
442458
default=None,
443459
description="Mount path for containers",
@@ -528,6 +544,20 @@ class CommonAgentConfig(BaseConfigSchema):
528544
validation_alias=AliasChoices("metadata-server-port", "metadata_server_port"),
529545
serialization_alias="metadata-server-port",
530546
)
547+
image_commit_path: AutoDirectoryPath = Field(
548+
default=AutoDirectoryPath("./tmp/backend.ai/commit"),
549+
description="Path for image commit",
550+
examples=["./tmp/backend.ai/commit"],
551+
validation_alias=AliasChoices("image-commit-path", "image_commit_path"),
552+
serialization_alias="image-commit-path",
553+
)
554+
abuse_report_path: Optional[Path] = Field(
555+
default=None,
556+
description="Path for abuse reports",
557+
examples=["/var/log/backend.ai/abuse"],
558+
validation_alias=AliasChoices("abuse-report-path", "abuse_report_path"),
559+
serialization_alias="abuse-report-path",
560+
)
531561
use_experimental_redis_event_dispatcher: bool = Field(
532562
default=False,
533563
description="Whether to use experimental Redis event dispatcher",
@@ -574,8 +604,8 @@ def _validate_rpc_listen_addr(cls, rpc_listen_addr: HostPortPair) -> HostPortPai
574604

575605

576606
class OverridableAgentConfig(BaseConfigSchema):
577-
id: Optional[str] = Field(
578-
default=None,
607+
id: str = Field(
608+
default_factory=lambda: f"agent-{uuid4()}",
579609
description="Agent ID",
580610
examples=["agent-001"],
581611
)
@@ -588,20 +618,6 @@ class OverridableAgentConfig(BaseConfigSchema):
588618
validation_alias=AliasChoices("agent-sock-port", "agent_sock_port"),
589619
serialization_alias="agent-sock-port",
590620
)
591-
ipc_base_path: AutoDirectoryPath = Field(
592-
default=AutoDirectoryPath("/tmp/backend.ai/ipc"),
593-
description="Base path for IPC",
594-
examples=["/tmp/backend.ai/ipc"],
595-
validation_alias=AliasChoices("ipc-base-path", "ipc_base_path"),
596-
serialization_alias="ipc-base-path",
597-
)
598-
var_base_path: AutoDirectoryPath = Field(
599-
default=AutoDirectoryPath("./var/lib/backend.ai"),
600-
description="Base path for variable data",
601-
examples=["./var/lib/backend.ai"],
602-
validation_alias=AliasChoices("var-base-path", "var_base_path"),
603-
serialization_alias="var-base-path",
604-
)
605621
scaling_group: str = Field(
606622
default="default",
607623
description="Scaling group name",
@@ -644,20 +660,6 @@ class OverridableAgentConfig(BaseConfigSchema):
644660
validation_alias=AliasChoices("block-network-plugins", "block_network_plugins"),
645661
serialization_alias="block-network-plugins",
646662
)
647-
image_commit_path: AutoDirectoryPath = Field(
648-
default=AutoDirectoryPath("./tmp/backend.ai/commit"),
649-
description="Path for image commit",
650-
examples=["./tmp/backend.ai/commit"],
651-
validation_alias=AliasChoices("image-commit-path", "image_commit_path"),
652-
serialization_alias="image-commit-path",
653-
)
654-
abuse_report_path: Optional[Path] = Field(
655-
default=None,
656-
description="Path for abuse reports",
657-
examples=["/var/log/backend.ai/abuse"],
658-
validation_alias=AliasChoices("abuse-report-path", "abuse_report_path"),
659-
serialization_alias="abuse-report-path",
660-
)
661663
force_terminate_abusing_containers: bool = Field(
662664
default=False,
663665
description="Whether to force terminate abusing containers",
@@ -1138,7 +1140,12 @@ class AgentOverrideConfig(BaseConfigSchema):
11381140
description="Resource config overrides for the individual agent",
11391141
)
11401142

1141-
def construct_unified_config(self, *, default: AgentUnifiedConfig) -> AgentUnifiedConfig:
1143+
def construct_unified_config(
1144+
self,
1145+
*,
1146+
default: AgentUnifiedConfig,
1147+
agent_idx: int,
1148+
) -> AgentUnifiedConfig:
11421149
agent_updates: dict[str, Any] = {}
11431150
if self.agent is not None:
11441151
agent_override_fields = self.agent.model_dump(include=self.agent.model_fields_set)
@@ -1177,9 +1184,24 @@ class AgentUnifiedConfig(AgentGlobalConfig, AgentSpecificConfig):
11771184
extra="allow",
11781185
)
11791186

1187+
@property
1188+
def agent_common(self) -> CommonAgentConfig:
1189+
return self.agent
1190+
1191+
@property
1192+
def agent_default(self) -> OverridableAgentConfig:
1193+
return self.agent
1194+
11801195
@property
11811196
def agent_configs(self) -> Sequence[AgentUnifiedConfig]:
1182-
return self._for_each_agent(lambda x: x)
1197+
agent_configs = self._for_each_agent(lambda x: x)
1198+
if not agent_configs:
1199+
raise ValueError("There must be at least one agent config")
1200+
return agent_configs
1201+
1202+
@property
1203+
def agent_ids(self) -> Sequence[AgentId]:
1204+
return [AgentId(agent_config.agent.id) for agent_config in self.agent_configs]
11831205

11841206
def with_updates(
11851207
self,
@@ -1265,7 +1287,10 @@ def validate(config: AgentSpecificConfig) -> None:
12651287
return self
12661288

12671289
def _for_each_agent(self, func: Callable[[AgentUnifiedConfig], R]) -> list[R]:
1268-
agents = [agent.construct_unified_config(default=self) for agent in self.agents]
1290+
agents = [
1291+
agent.construct_unified_config(default=self, agent_idx=i)
1292+
for i, agent in enumerate(self.agents)
1293+
]
12691294
if not agents:
12701295
agents.append(self)
12711296

0 commit comments

Comments
 (0)