Skip to content

Commit 0c85e11

Browse files
committed
feat(BA-2752): Add multi agents in agent server config
This change adds config options that allow the user to specify multiple agents in a single agent runtime server. It works by keeping the existing agent configuration as the default and adding the multiple agents as overrides.
1 parent 86bc18c commit 0c85e11

File tree

6 files changed

+982
-265
lines changed

6 files changed

+982
-265
lines changed

changes/6315.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for multiple agents in agent server config

configs/agent/sample.toml

Lines changed: 155 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,28 @@
66
#
77
# Generated automatically from the AgentUnifiedConfig schema.
88

9-
# Agent configuration
9+
# Complete agent configuration (common + overridable).
1010
[agent]
11+
# Agent ID
12+
id = "agent-001"
13+
# Agent socket port
14+
agent-sock-port = 6007
15+
# Scaling group name
16+
scaling-group = "default"
17+
# Scaling group type
18+
scaling-group-type = "compute"
19+
# Allowed compute plugins
20+
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
21+
# Blocked compute plugins
22+
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
23+
# Allowed network plugins
24+
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
25+
# Blocked network plugins
26+
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
27+
# Whether to force terminate abusing containers
28+
force-terminate-abusing-containers = false
29+
# Kernel creation concurrency
30+
kernel-creation-concurrency = 4
1131
# Backend type for the agent.
1232
# This determines how the agent interacts with the underlying infrastructure.
1333
# Available options are:
@@ -33,10 +53,6 @@
3353
## rpc-auth-manager-public-key = "/path/to/public.key"
3454
# Path to RPC auth agent keypair
3555
## rpc-auth-agent-keypair = "/path/to/keypair.key"
36-
# Agent socket port
37-
agent-sock-port = 6007
38-
# Agent ID
39-
## id = "agent-001"
4056
# Base path for IPC
4157
ipc-base-path = "/tmp/backend.ai/ipc"
4258
# Base path for variable data
@@ -51,10 +67,6 @@
5167
## region = "us-east-1"
5268
# Instance type
5369
## instance-type = "m5.large"
54-
# Scaling group name
55-
scaling-group = "default"
56-
# Scaling group type
57-
scaling-group-type = "compute"
5870
# Path to PID file
5971
pid-file = "/dev/null"
6072
# Event loop type
@@ -69,22 +81,10 @@
6981
metadata-server-bind-host = "0.0.0.0"
7082
# Metadata server port
7183
metadata-server-port = 40128
72-
# Allowed compute plugins
73-
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
74-
# Blocked compute plugins
75-
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
76-
# Allowed network plugins
77-
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
78-
# Blocked network plugins
79-
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
8084
# Path for image commit
8185
image-commit-path = "tmp/backend.ai/commit"
8286
# Path for abuse reports
8387
## abuse-report-path = "/var/log/backend.ai/abuse"
84-
# Whether to force terminate abusing containers
85-
force-terminate-abusing-containers = false
86-
# Kernel creation concurrency
87-
kernel-creation-concurrency = 4
8888
# Whether to use experimental Redis event dispatcher
8989
use-experimental-redis-event-dispatcher = false
9090
# Docker mode detected based on kernel version (linuxkit/native)
@@ -99,17 +99,16 @@
9999
# Synchronization interval in seconds
100100
interval = 10.0
101101

102-
# Container configuration
102+
# Complete container configuration (common + overridable).
103103
[container]
104104
# Kernel user ID
105105
kernel-uid = -1
106106
# Kernel group ID
107107
kernel-gid = -1
108-
# Bind host for containers
109-
bind-host = ""
110-
# Advertised host for containers
111-
## advertised-host = "192.168.1.100"
112-
# Port range for containers
108+
# Port range for containers.
109+
# If multiple agents are used, the user must ensure that the port ranges
110+
# do not overlap between agents; otherwise it may cause subtle issues
111+
# late into the agents' runtime.
113112
port-range = [ 30000, 31000,]
114113
# Statistics type
115114
## stats-type = "docker"
@@ -135,6 +134,36 @@
135134
# networks, and services.
136135
# This field is only used when backend is set to 'docker'.
137136
swarm-enabled = false
137+
# Bind host for containers
138+
bind-host = ""
139+
# Advertised host for containers
140+
## advertised-host = "192.168.1.100"
141+
142+
# Resource configuration.
143+
# If agents field is populated, this field indicates the default values for all
144+
# agents.
145+
[resource]
146+
# The number of CPU cores reserved for the operating system and the agent
147+
# service.
148+
reserved-cpu = 1
149+
# The memory space reserved for the operating system and the agent service. It
150+
# is subtracted from the reported main memory size and not available for user
151+
# workload allocation. Depending on the memory-align-size option and system
152+
# configuration, the effective value may not be exact but slightly less or more,
153+
# within the memory-align-size.
154+
reserved-mem = "1G"
155+
# The disk space reserved for the operating system and the agent service.
156+
# Currently this value is unused. In future releases, it may be used to preserve
157+
# the minimum disk space from the scratch disk allocation via loopback files.
158+
reserved-disk = "8G"
159+
# The alignment of the reported main memory size to absorb tiny deviations from
160+
# per-node firmware/hardware settings. Recommended to be multiple of the
161+
# page/hugepage size (e.g., 2 MiB).
162+
memory-align-size = "16M"
163+
# Resource allocation order
164+
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
165+
# Affinity policy
166+
affinity-policy = "INTERLEAVED"
138167

139168
# Pyroscope configuration
140169
[pyroscope]
@@ -215,30 +244,6 @@
215244
# Override default log level for specific scope of package
216245
[logging.pkg_ns]
217246

218-
# Resource configuration
219-
[resource]
220-
# The number of CPU cores reserved for the operating system and the agent
221-
# service.
222-
reserved-cpu = 1
223-
# The memory space reserved for the operating system and the agent service. It
224-
# is subtracted from the reported main memory size and not available for user
225-
# workload allocation. Depending on the memory-align-size option and system
226-
# configuration, this may not be the exact value but have slightly less or more
227-
# values within the memory-align-size.
228-
reserved-mem = "1G"
229-
# The disk space reserved for the operating system and the agent service.
230-
# Currently this value is unused. In future releases, it may be used to preserve
231-
# the minimum disk space from the scratch disk allocation via loopback files.
232-
reserved-disk = "8G"
233-
# The alignment of the reported main memory size to absorb tiny deviations from
234-
# per-node firwmare/hardware settings. Recommended to be multiple of the
235-
# page/hugepage size (e.g., 2 MiB).
236-
memory-align-size = "16M"
237-
# Resource allocation order
238-
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
239-
# Affinity policy
240-
affinity-policy = "INTERLEAVED"
241-
242247
# OpenTelemetry configuration
243248
[otel]
244249
# Whether to enable OpenTelemetry
@@ -326,3 +331,102 @@
326331
init-polling-timeout-sec = 60.0
327332
# Init timeout in seconds
328333
init-timeout-sec = 60.0
334+
335+
# Configuration overrides for multiple agents.
336+
# Use this field only to define 2 or more agents, as defining only one
337+
# agent using this field is redundant. Use the fields agent, container,
338+
# and resource to define the configuration at a global level.
339+
# Any field populated in the agents config will be treated as an
340+
# override to the global default values. Thus the global fields must still
341+
# be provided when defining multiple agents.
342+
[[agents]]
343+
# Add multiple [[agents]] sections as needed
344+
# Agent settings that can be overridden per-agent in multi-agent mode.
345+
[agents.agent]
346+
# Agent ID
347+
id = "agent-001"
348+
# Agent socket port
349+
agent-sock-port = 6007
350+
# Scaling group name
351+
scaling-group = "default"
352+
# Scaling group type
353+
scaling-group-type = "compute"
354+
# Allowed compute plugins
355+
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
356+
# Blocked compute plugins
357+
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
358+
# Allowed network plugins
359+
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
360+
# Blocked network plugins
361+
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
362+
# Whether to force terminate abusing containers
363+
force-terminate-abusing-containers = false
364+
# Kernel creation concurrency
365+
kernel-creation-concurrency = 4
366+
367+
# Container lifecycle synchronization config
368+
[agents.agent.sync-container-lifecycles]
369+
# Whether to enable container lifecycle synchronization
370+
enabled = true
371+
# Synchronization interval in seconds
372+
interval = 10.0
373+
374+
# Container settings that can be overridden per-agent in multi-agent mode.
375+
[agents.container]
376+
# Kernel user ID
377+
kernel-uid = -1
378+
# Kernel group ID
379+
kernel-gid = -1
380+
# Port range for containers.
381+
# If multiple agents are used, the user must ensure that the port ranges
382+
# do not overlap between agents; otherwise it may cause subtle issues
383+
# late into the agents' runtime.
384+
port-range = [ 30000, 31000,]
385+
# Statistics type
386+
## stats-type = "cgroup"
387+
# Sandbox type
388+
sandbox-type = "docker"
389+
# Jail arguments
390+
jail-args = [ "--mount", "/tmp",]
391+
# Scratch type
392+
scratch-type = "hostdir"
393+
# Scratch root directory
394+
scratch-root = "scratches"
395+
# Scratch size
396+
scratch-size = 0
397+
# Scratch NFS address
398+
## scratch-nfs-address = "192.168.1.100:/export"
399+
# Scratch NFS options
400+
## scratch-nfs-options = "rw,sync"
401+
# Alternative bridge network
402+
## alternative-bridge = "br-backend"
403+
# Whether to enable Docker Swarm mode.
404+
# This allows the agent to manage containers in a Docker Swarm cluster.
405+
# When enabled, the agent will use Docker Swarm APIs to manage containers,
406+
# networks, and services.
407+
# This field is only used when backend is set to 'docker'.
408+
swarm-enabled = false
409+
410+
# Resource config overrides for the individual agent
411+
[agents.resource]
412+
# The number of CPU cores reserved for the operating system and the agent
413+
# service.
414+
reserved-cpu = 1
415+
# The memory space reserved for the operating system and the agent service. It
416+
# is subtracted from the reported main memory size and not available for user
417+
# workload allocation. Depending on the memory-align-size option and system
418+
# configuration, the effective value may not be exact but slightly less or more,
419+
# within the memory-align-size.
420+
reserved-mem = 1073741824
421+
# The disk space reserved for the operating system and the agent service.
422+
# Currently this value is unused. In future releases, it may be used to preserve
423+
# the minimum disk space from the scratch disk allocation via loopback files.
424+
reserved-disk = 8589934592
425+
# The alignment of the reported main memory size to absorb tiny deviations from
426+
# per-node firmware/hardware settings. Recommended to be multiple of the
427+
# page/hugepage size (e.g., 2 MiB).
428+
memory-align-size = 16777216
429+
# Resource allocation order
430+
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
431+
# Affinity policy
432+
affinity-policy = 1

src/ai/backend/agent/agent.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,6 +1517,9 @@ def reset_port_pool(self, used_ports: Iterable[int]) -> None:
15171517
}
15181518
self.port_pool = original_port_pool
15191519

1520+
def update_scaling_group(self, scaling_group: str) -> None:
1521+
self.local_config.update(agent_update={"scaling_group": scaling_group})
1522+
15201523
async def purge_containers(self, containers: Iterable[ContainerKernelId]) -> None:
15211524
tasks = [self._purge_container(container) for container in containers]
15221525
await asyncio.gather(*tasks, return_exceptions=True)

0 commit comments

Comments
 (0)