Skip to content

Commit daec211

Browse files
committed
feat(BA-2752): Add multi agents in agent server config
This change adds configuration options that allow the user to specify multiple agents in an agent runtime server. It works by keeping the existing agent configuration as the default values and adding per-agent entries as overrides.
1 parent 84b0aae commit daec211

File tree

6 files changed

+928
-232
lines changed

6 files changed

+928
-232
lines changed

changes/6315.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for multiple agents in agent server config

configs/agent/sample.toml

Lines changed: 142 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
# It is not intended to be set in the configuration file.
1212
## plugins = "..."
1313

14-
# Agent configuration
14+
# Agent configuration.
15+
# If agents field is populated, this field indicates the default values for all
16+
# agents.
1517
[agent]
1618
# Backend type for the agent.
1719
# This determines how the agent interacts with the underlying infrastructure.
@@ -75,7 +77,7 @@
7577
# Metadata server port
7678
metadata-server-port = 40128
7779
# Allowed compute plugins
78-
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
80+
## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",]
7981
# Blocked compute plugins
8082
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
8183
# Allowed network plugins
@@ -104,7 +106,9 @@
104106
# Synchronization interval in seconds
105107
interval = 10.0
106108

107-
# Container configuration
109+
# Container configuration.
110+
# If agents field is populated, this field indicates the default values for all
111+
# agents.
108112
[container]
109113
# Kernel user ID
110114
kernel-uid = -1
@@ -114,7 +118,10 @@
114118
bind-host = ""
115119
# Advertised host for containers
116120
## advertised-host = "192.168.1.100"
117-
# Port range for containers
121+
# Port range for containers.
122+
# If multiple agents are used, user must ensure that the port ranges
123+
# do not overlap between the agents, else it may cause subtle issues
124+
# late into the agent's runtime.
118125
port-range = [ 30000, 31000,]
119126
# Statistics type
120127
## stats-type = "docker"
@@ -149,6 +156,32 @@
149156
# It is not intended to be set in the configuration file.
150157
## [container.krunner-volumes]
151158

159+
# Resource configuration.
160+
# If agents field is populated, this field indicates the default values for all
161+
# agents.
162+
[resource]
163+
# The number of CPU cores reserved for the operating system and the agent
164+
# service.
165+
reserved-cpu = 1
166+
# The memory space reserved for the operating system and the agent service. It
167+
# is subtracted from the reported main memory size and not available for user
168+
# workload allocation. Depending on the memory-align-size option and system
169+
# configuration, this may not be the exact value but have slightly less or more
170+
# values within the memory-align-size.
171+
reserved-mem = "1G"
172+
# The disk space reserved for the operating system and the agent service.
173+
# Currently this value is unused. In future releases, it may be used to preserve
174+
# the minimum disk space from the scratch disk allocation via loopback files.
175+
reserved-disk = "8G"
176+
# The alignment of the reported main memory size to absorb tiny deviations from
177+
# per-node firmware/hardware settings. Recommended to be multiple of the
178+
# page/hugepage size (e.g., 2 MiB).
179+
memory-align-size = "16M"
180+
# Resource allocation order
181+
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
182+
# Affinity policy
183+
affinity-policy = "INTERLEAVED"
184+
152185
# Pyroscope configuration
153186
[pyroscope]
154187
# Whether to enable Pyroscope profiling
@@ -228,30 +261,6 @@
228261
# Override default log level for specific scope of package
229262
[logging.pkg_ns]
230263

231-
# Resource configuration
232-
[resource]
233-
# The number of CPU cores reserved for the operating system and the agent
234-
# service.
235-
reserved-cpu = 1
236-
# The memory space reserved for the operating system and the agent service. It
237-
# is subtracted from the reported main memory size and not available for user
238-
# workload allocation. Depending on the memory-align-size option and system
239-
# configuration, this may not be the exact value but have slightly less or more
240-
# values within the memory-align-size.
241-
reserved-mem = "1G"
242-
# The disk space reserved for the operating system and the agent service.
243-
# Currently this value is unused. In future releases, it may be used to preserve
244-
# the minimum disk space from the scratch disk allocation via loopback files.
245-
reserved-disk = "8G"
246-
# The alignment of the reported main memory size to absorb tiny deviations from
247-
# per-node firwmare/hardware settings. Recommended to be multiple of the
248-
# page/hugepage size (e.g., 2 MiB).
249-
memory-align-size = "16M"
250-
# Resource allocation order
251-
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
252-
# Affinity policy
253-
affinity-policy = "INTERLEAVED"
254-
255264
# OpenTelemetry configuration
256265
[otel]
257266
# Whether to enable OpenTelemetry
@@ -344,3 +353,108 @@
344353
# This field is injected at runtime based on etcd configuration.
345354
# It is not intended to be set in the other way.
346355
## [redis]
356+
357+
# Configuration overrides for multiple agents.
358+
# Use this field only to define 2 or more agents, as defining only one
359+
# agent using this field is redundant. Use the fields agent, container,
360+
# and resource to define the configuration at a global level.
361+
# Any field populated in the agents config will be treated as an
362+
# override to the global default values. Thus the global fields must still
363+
# be provided when defining multiple agents.
364+
[[agents]]
365+
# Add multiple [[agents]] sections as needed
366+
# Agent config overrides for the individual agent.
367+
# All fields except Agent ID are optional by default.
368+
# Only override fields if necessary.
369+
[agents.agent]
370+
# Agent ID
371+
id = "agent-001"
372+
# Agent socket port
373+
## agent-sock-port = 6007 # min=1024 max=65535
374+
# Mount path for containers
375+
## mount-path = "/mnt/backend.ai"
376+
# Whether to enable cohabiting storage proxy
377+
## cohabiting-storage-proxy = true
378+
# Allowed compute plugins
379+
## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",]
380+
# Blocked compute plugins
381+
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
382+
# Allowed network plugins
383+
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
384+
# Blocked network plugins
385+
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
386+
# Whether to force terminate abusing containers
387+
## force-terminate-abusing-containers = true
388+
# Kernel creation concurrency
389+
## kernel-creation-concurrency = 4 # min=1 max=32
390+
# Docker mode detected based on kernel version (linuxkit/native)
391+
## docker-mode = "linuxkit"
392+
# Owner uid:gid of the mount directory
393+
## mount-path-uid-gid = "root:root"
394+
395+
# Container lifecycle synchronization config
396+
[agents.agent.sync-container-lifecycles]
397+
# Whether to enable container lifecycle synchronization
398+
enabled = true
399+
# Synchronization interval in seconds
400+
interval = 10.0
401+
402+
# Container config overrides for the individual agent
403+
[agents.container]
404+
# Kernel user ID
405+
## kernel-uid = 1000
406+
# Kernel group ID
407+
## kernel-gid = 1000
408+
# Port range for containers.
409+
# If multiple agents are used, user must ensure that the port ranges
410+
# do not overlap between the agents, else it may cause subtle issues
411+
# late into the agent's runtime.
412+
## port-range = [ 30000, 31000,]
413+
# Statistics type
414+
## stats-type = "cgroup"
415+
# Sandbox type
416+
## sandbox-type = "docker"
417+
# Jail arguments
418+
## jail-args = [ "--mount", "/tmp",]
419+
# Scratch type
420+
## scratch-type = "hostdir"
421+
# Scratch root directory
422+
## scratch-root = "./scratches"
423+
# Scratch size
424+
## scratch-size = "1G"
425+
# Scratch NFS address
426+
## scratch-nfs-address = "192.168.1.100:/export"
427+
# Scratch NFS options
428+
## scratch-nfs-options = "rw,sync"
429+
# Alternative bridge network
430+
## alternative-bridge = "br-backend"
431+
# Whether to enable Docker Swarm mode.
432+
# This allows the agent to manage containers in a Docker Swarm cluster.
433+
# When enabled, the agent will use Docker Swarm APIs to manage containers,
434+
# networks, and services.
435+
# This field is only used when backend is set to 'docker'.
436+
## swarm-enabled = true
437+
438+
# Resource config overrides for the individual agent
439+
[agents.resource]
440+
# The number of CPU cores reserved for the operating system and the agent
441+
# service.
442+
reserved-cpu = 1
443+
# The memory space reserved for the operating system and the agent service. It
444+
# is subtracted from the reported main memory size and not available for user
445+
# workload allocation. Depending on the memory-align-size option and system
446+
# configuration, this may not be the exact value but have slightly less or more
447+
# values within the memory-align-size.
448+
reserved-mem = "1G"
449+
# The disk space reserved for the operating system and the agent service.
450+
# Currently this value is unused. In future releases, it may be used to preserve
451+
# the minimum disk space from the scratch disk allocation via loopback files.
452+
reserved-disk = "8G"
453+
# The alignment of the reported main memory size to absorb tiny deviations from
454+
# per-node firmware/hardware settings. Recommended to be multiple of the
455+
# page/hugepage size (e.g., 2 MiB).
456+
memory-align-size = "16M"
457+
# Resource allocation order
458+
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
459+
# Affinity policy
460+
affinity-policy = "INTERLEAVED"

src/ai/backend/agent/agent.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,6 +1517,11 @@ def reset_port_pool(self, used_ports: Iterable[int]) -> None:
15171517
}
15181518
self.port_pool = original_port_pool
15191519

1520+
def update_scaling_group(self, scaling_group: str) -> None:
1521+
self.local_config = self.local_config.with_updates(
1522+
agent_update={"scaling_group": scaling_group}
1523+
)
1524+
15201525
async def purge_containers(self, containers: Iterable[ContainerKernelId]) -> None:
15211526
tasks = [self._purge_container(container) for container in containers]
15221527
await asyncio.gather(*tasks, return_exceptions=True)

0 commit comments

Comments
 (0)