Skip to content

Commit 515dea3

Browse files
committed
feat(BA-2752): Add multi agents in agent server config
This change adds configuration options that allow the user to specify multiple agents in a single agent runtime server. It works by keeping the existing agent configuration as the default values and adding per-agent sections as overrides.
1 parent 84b0aae commit 515dea3

File tree

6 files changed

+963
-263
lines changed

6 files changed

+963
-263
lines changed

changes/6315.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for multiple agents in agent server config

configs/agent/sample.toml

Lines changed: 177 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,38 @@
1111
# It is not intended to be set in the configuration file.
1212
## plugins = "..."
1313

14-
# Agent configuration
14+
# Agent configuration.
15+
# If agents field is populated, this field indicates the default values for all
16+
# agents.
1517
[agent]
18+
# Agent ID
19+
## id = "agent-001"
20+
# Agent socket port
21+
agent-sock-port = 6007
22+
# Base path for IPC
23+
ipc-base-path = "/tmp/backend.ai/ipc"
24+
# Base path for variable data
25+
var-base-path = "var/lib/backend.ai"
26+
# Scaling group name
27+
scaling-group = "default"
28+
# Scaling group type
29+
scaling-group-type = "compute"
30+
# Allowed compute plugins
31+
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
32+
# Blocked compute plugins
33+
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
34+
# Allowed network plugins
35+
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
36+
# Blocked network plugins
37+
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
38+
# Path for image commit
39+
image-commit-path = "tmp/backend.ai/commit"
40+
# Path for abuse reports
41+
## abuse-report-path = "/var/log/backend.ai/abuse"
42+
# Whether to force terminate abusing containers
43+
force-terminate-abusing-containers = false
44+
# Kernel creation concurrency
45+
kernel-creation-concurrency = 4
1646
# Backend type for the agent.
1747
# This determines how the agent interacts with the underlying infrastructure.
1848
# Available options are:
@@ -38,14 +68,6 @@
3868
## rpc-auth-manager-public-key = "/path/to/public.key"
3969
# Path to RPC auth agent keypair
4070
## rpc-auth-agent-keypair = "/path/to/keypair.key"
41-
# Agent socket port
42-
agent-sock-port = 6007
43-
# Agent ID
44-
## id = "agent-001"
45-
# Base path for IPC
46-
ipc-base-path = "/tmp/backend.ai/ipc"
47-
# Base path for variable data
48-
var-base-path = "var/lib/backend.ai"
4971
# Mount path for containers
5072
## mount-path = "/mnt/backend.ai"
5173
# Whether to enable cohabiting storage proxy
@@ -56,10 +78,6 @@
5678
## region = "us-east-1"
5779
# Instance type
5880
## instance-type = "m5.large"
59-
# Scaling group name
60-
scaling-group = "default"
61-
# Scaling group type
62-
scaling-group-type = "compute"
6381
# Path to PID file
6482
pid-file = "/dev/null"
6583
# Event loop type
@@ -74,22 +92,6 @@
7492
metadata-server-bind-host = "0.0.0.0"
7593
# Metadata server port
7694
metadata-server-port = 40128
77-
# Allowed compute plugins
78-
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
79-
# Blocked compute plugins
80-
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
81-
# Allowed network plugins
82-
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
83-
# Blocked network plugins
84-
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
85-
# Path for image commit
86-
image-commit-path = "tmp/backend.ai/commit"
87-
# Path for abuse reports
88-
## abuse-report-path = "/var/log/backend.ai/abuse"
89-
# Whether to force terminate abusing containers
90-
force-terminate-abusing-containers = false
91-
# Kernel creation concurrency
92-
kernel-creation-concurrency = 4
9395
# Whether to use experimental Redis event dispatcher
9496
use-experimental-redis-event-dispatcher = false
9597
# Docker mode detected based on kernel version (linuxkit/native)
@@ -104,17 +106,18 @@
104106
# Synchronization interval in seconds
105107
interval = 10.0
106108

107-
# Container configuration
109+
# Container configuration.
110+
# If agents field is populated, this field indicates the default values for all
111+
# agents.
108112
[container]
109113
# Kernel user ID
110114
kernel-uid = -1
111115
# Kernel group ID
112116
kernel-gid = -1
113-
# Bind host for containers
114-
bind-host = ""
115-
# Advertised host for containers
116-
## advertised-host = "192.168.1.100"
117-
# Port range for containers
117+
# Port range for containers.
118+
# If multiple agents are used, the user must ensure that the port ranges
119+
# do not overlap between agents; otherwise it may cause subtle issues
120+
# late into the agents' runtime.
118121
port-range = [ 30000, 31000,]
119122
# Statistics type
120123
## stats-type = "docker"
@@ -140,6 +143,10 @@
140143
# networks, and services.
141144
# This field is only used when backend is set to 'docker'.
142145
swarm-enabled = false
146+
# Bind host for containers
147+
bind-host = ""
148+
# Advertised host for containers
149+
## advertised-host = "192.168.1.100"
143150

144151
# KRunner volumes configuration, mapping container names to host paths.
145152
# This is used to specify volumes that should be mounted into containers
@@ -149,6 +156,32 @@
149156
# It is not intended to be set in the configuration file.
150157
## [container.krunner-volumes]
151158

159+
# Resource configuration.
160+
# If agents field is populated, this field indicates the default values for all
161+
# agents.
162+
[resource]
163+
# The number of CPU cores reserved for the operating system and the agent
164+
# service.
165+
reserved-cpu = 1
166+
# The memory space reserved for the operating system and the agent service. It
167+
# is subtracted from the reported main memory size and not available for user
168+
# workload allocation. Depending on the memory-align-size option and system
169+
# configuration, this may not be the exact value but have slightly less or more
170+
# values within the memory-align-size.
171+
reserved-mem = "1G"
172+
# The disk space reserved for the operating system and the agent service.
173+
# Currently this value is unused. In future releases, it may be used to preserve
174+
# the minimum disk space from the scratch disk allocation via loopback files.
175+
reserved-disk = "8G"
176+
# The alignment of the reported main memory size to absorb tiny deviations from
177+
# per-node firmware/hardware settings. Recommended to be multiple of the
178+
# page/hugepage size (e.g., 2 MiB).
179+
memory-align-size = "16M"
180+
# Resource allocation order
181+
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
182+
# Affinity policy
183+
affinity-policy = "INTERLEAVED"
184+
152185
# Pyroscope configuration
153186
[pyroscope]
154187
# Whether to enable Pyroscope profiling
@@ -228,30 +261,6 @@
228261
# Override default log level for specific scope of package
229262
[logging.pkg_ns]
230263

231-
# Resource configuration
232-
[resource]
233-
# The number of CPU cores reserved for the operating system and the agent
234-
# service.
235-
reserved-cpu = 1
236-
# The memory space reserved for the operating system and the agent service. It
237-
# is subtracted from the reported main memory size and not available for user
238-
# workload allocation. Depending on the memory-align-size option and system
239-
# configuration, this may not be the exact value but have slightly less or more
240-
# values within the memory-align-size.
241-
reserved-mem = "1G"
242-
# The disk space reserved for the operating system and the agent service.
243-
# Currently this value is unused. In future releases, it may be used to preserve
244-
# the minimum disk space from the scratch disk allocation via loopback files.
245-
reserved-disk = "8G"
246-
# The alignment of the reported main memory size to absorb tiny deviations from
247-
# per-node firwmare/hardware settings. Recommended to be multiple of the
248-
# page/hugepage size (e.g., 2 MiB).
249-
memory-align-size = "16M"
250-
# Resource allocation order
251-
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
252-
# Affinity policy
253-
affinity-policy = "INTERLEAVED"
254-
255264
# OpenTelemetry configuration
256265
[otel]
257266
# Whether to enable OpenTelemetry
@@ -344,3 +353,112 @@
344353
# This field is injected at runtime based on etcd configuration.
345354
# It is not intended to be set in the other way.
346355
## [redis]
356+
357+
# Configuration overrides for multiple agents.
358+
# Use this field only to define 2 or more agents, as defining only one
359+
# agent using this field is redundant. Use the fields agent, container,
360+
# and resource to define the configuration at a global level.
361+
# Any field populated in the agents config will be treated as an
362+
# override to the global default values. Thus the global fields must still
363+
# be provided when defining multiple agents.
364+
[[agents]]
365+
# Add multiple [[agents]] sections as needed
366+
# Agent config overrides for the individual agent.
367+
# All fields except the agent ID are optional by default.
368+
# Only override fields if necessary.
369+
[agents.agent]
370+
# Agent ID
371+
## id = "agent-001"
372+
# Agent socket port
373+
agent-sock-port = 6007
374+
# Base path for IPC
375+
ipc-base-path = "/tmp/backend.ai/ipc"
376+
# Base path for variable data
377+
var-base-path = "var/lib/backend.ai"
378+
# Scaling group name
379+
scaling-group = "default"
380+
# Scaling group type
381+
scaling-group-type = "compute"
382+
# Allowed compute plugins
383+
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
384+
# Blocked compute plugins
385+
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
386+
# Allowed network plugins
387+
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
388+
# Blocked network plugins
389+
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
390+
# Path for image commit
391+
image-commit-path = "tmp/backend.ai/commit"
392+
# Path for abuse reports
393+
## abuse-report-path = "/var/log/backend.ai/abuse"
394+
# Whether to force terminate abusing containers
395+
force-terminate-abusing-containers = false
396+
# Kernel creation concurrency
397+
kernel-creation-concurrency = 4
398+
399+
# Container lifecycle synchronization config
400+
[agents.agent.sync-container-lifecycles]
401+
# Whether to enable container lifecycle synchronization
402+
enabled = true
403+
# Synchronization interval in seconds
404+
interval = 10.0
405+
406+
# Container config overrides for the individual agent
407+
[agents.container]
408+
# Kernel user ID
409+
kernel-uid = -1
410+
# Kernel group ID
411+
kernel-gid = -1
412+
# Port range for containers.
413+
# If multiple agents are used, the user must ensure that the port ranges
413+
# do not overlap between agents; otherwise it may cause subtle issues
414+
# late into the agents' runtime.
416+
port-range = [ 30000, 31000,]
417+
# Statistics type
418+
## stats-type = "cgroup"
419+
# Sandbox type
420+
sandbox-type = "docker"
421+
# Jail arguments
422+
jail-args = [ "--mount", "/tmp",]
423+
# Scratch type
424+
scratch-type = "hostdir"
425+
# Scratch root directory
426+
scratch-root = "scratches"
427+
# Scratch size
428+
scratch-size = 0
429+
# Scratch NFS address
430+
## scratch-nfs-address = "192.168.1.100:/export"
431+
# Scratch NFS options
432+
## scratch-nfs-options = "rw,sync"
433+
# Alternative bridge network
434+
## alternative-bridge = "br-backend"
435+
# Whether to enable Docker Swarm mode.
436+
# This allows the agent to manage containers in a Docker Swarm cluster.
437+
# When enabled, the agent will use Docker Swarm APIs to manage containers,
438+
# networks, and services.
439+
# This field is only used when backend is set to 'docker'.
440+
swarm-enabled = false
441+
442+
# Resource config overrides for the individual agent
443+
[agents.resource]
444+
# The number of CPU cores reserved for the operating system and the agent
445+
# service.
446+
reserved-cpu = 1
447+
# The memory space reserved for the operating system and the agent service. It
448+
# is subtracted from the reported main memory size and not available for user
449+
# workload allocation. Depending on the memory-align-size option and system
450+
# configuration, this may not be the exact value but have slightly less or more
451+
# values within the memory-align-size.
452+
reserved-mem = 1073741824
453+
# The disk space reserved for the operating system and the agent service.
454+
# Currently this value is unused. In future releases, it may be used to preserve
455+
# the minimum disk space from the scratch disk allocation via loopback files.
456+
reserved-disk = 8589934592
457+
# The alignment of the reported main memory size to absorb tiny deviations from
458+
# per-node firmware/hardware settings. Recommended to be multiple of the
459+
# page/hugepage size (e.g., 2 MiB).
460+
memory-align-size = 16777216
461+
# Resource allocation order
462+
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
463+
# Affinity policy
464+
affinity-policy = 1

src/ai/backend/agent/agent.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,6 +1517,11 @@ def reset_port_pool(self, used_ports: Iterable[int]) -> None:
15171517
}
15181518
self.port_pool = original_port_pool
15191519

1520+
def update_scaling_group(self, scaling_group: str) -> None:
1521+
self.local_config = self.local_config.with_updates(
1522+
agent_update={"scaling_group": scaling_group}
1523+
)
1524+
15201525
async def purge_containers(self, containers: Iterable[ContainerKernelId]) -> None:
15211526
tasks = [self._purge_container(container) for container in containers]
15221527
await asyncio.gather(*tasks, return_exceptions=True)

0 commit comments

Comments
 (0)