lablup
diff --git a/changes/6315.feature.md
Lines changed: 1 addition & 0 deletions b/changes/6315.feature.md
Lines changed: 1 addition & 0 deletions
diff --git a/configs/agent/sample.toml
Lines changed: 155 additions & 51 deletions b/configs/agent/sample.toml
Lines changed: 155 additions & 51 deletions
diff --git a/src/ai/backend/agent/agent.py
Lines changed: 3 additions & 0 deletions b/src/ai/backend/agent/agent.py
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1 @@
+Add support for multiple agents in agent server config
@@ -6,8 +6,28 @@
 #
 # Generated automatically from the AgentUnifiedConfig schema.
 
-# Agent configuration
+# Complete agent configuration (common + overridable).
 [agent]
+  # Agent ID
+  id = "agent-001"
+  # Agent socket port
+  agent-sock-port = 6007
+  # Scaling group name
+  scaling-group = "default"
+  # Scaling group type
+  scaling-group-type = "compute"
+  # Allowed compute plugins
+  ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
+  # Blocked compute plugins
+  ## block-compute-plugins = [ "ai.backend.accelerator.mock",]
+  # Allowed network plugins
+  ## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
+  # Blocked network plugins
+  ## block-network-plugins = [ "ai.backend.manager.network.overlay",]
+  # Whether to force terminate abusing containers
+  force-terminate-abusing-containers = false
+  # Kernel creation concurrency
+  kernel-creation-concurrency = 4
   # Backend type for the agent.
   # This determines how the agent interacts with the underlying infrastructure.
   # Available options are:
@@ -33,10 +53,6 @@
   ## rpc-auth-manager-public-key = "/path/to/public.key"
   # Path to RPC auth agent keypair
   ## rpc-auth-agent-keypair = "/path/to/keypair.key"
-  # Agent socket port
-  agent-sock-port = 6007
-  # Agent ID
-  ## id = "agent-001"
   # Base path for IPC
   ipc-base-path = "/tmp/backend.ai/ipc"
   # Base path for variable data
@@ -51,10 +67,6 @@
   ## region = "us-east-1"
   # Instance type
   ## instance-type = "m5.large"
-  # Scaling group name
-  scaling-group = "default"
-  # Scaling group type
-  scaling-group-type = "compute"
   # Path to PID file
   pid-file = "/dev/null"
   # Event loop type
@@ -69,22 +81,10 @@
   metadata-server-bind-host = "0.0.0.0"
   # Metadata server port
   metadata-server-port = 40128
-  # Allowed compute plugins
-  ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
-  # Blocked compute plugins
-  ## block-compute-plugins = [ "ai.backend.accelerator.mock",]
-  # Allowed network plugins
-  ## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
-  # Blocked network plugins
-  ## block-network-plugins = [ "ai.backend.manager.network.overlay",]
   # Path for image commit
   image-commit-path = "tmp/backend.ai/commit"
   # Path for abuse reports
   ## abuse-report-path = "/var/log/backend.ai/abuse"
-  # Whether to force terminate abusing containers
-  force-terminate-abusing-containers = false
-  # Kernel creation concurrency
-  kernel-creation-concurrency = 4
   # Whether to use experimental Redis event dispatcher
   use-experimental-redis-event-dispatcher = false
   # Docker mode detected based on kernel version (linuxkit/native)
@@ -99,17 +99,16 @@
     # Synchronization interval in seconds
     interval = 10.0
 
-# Container configuration
+# Complete container configuration (common + overridable).
 [container]
   # Kernel user ID
   kernel-uid = -1
   # Kernel group ID
   kernel-gid = -1
-  # Bind host for containers
-  bind-host = ""
-  # Advertised host for containers
-  ## advertised-host = "192.168.1.100"
-  # Port range for containers
+  # Port range for containers.
+  # If multiple agents are used, the user must ensure that the port ranges
+  # do not overlap between agents; otherwise it may cause subtle issues
+  # late in the agents' runtime.
   port-range = [ 30000, 31000,]
   # Statistics type
   ## stats-type = "docker"
@@ -135,6 +134,36 @@
   # networks, and services.
   # This field is only used when backend is set to 'docker'.
   swarm-enabled = false
+  # Bind host for containers
+  bind-host = ""
+  # Advertised host for containers
+  ## advertised-host = "192.168.1.100"
+
+# Resource configuration.
+# If the agents field is populated, this section provides the default values
+# for all agents.
+[resource]
+  # The number of CPU cores reserved for the operating system and the agent
+  # service.
+  reserved-cpu = 1
+  # The memory space reserved for the operating system and the agent service. It
+  # is subtracted from the reported main memory size and not available for user
+  # workload allocation. Depending on the memory-align-size option and system
+  # configuration, this may not be the exact value but have slightly less or more
+  # values within the memory-align-size.
+  reserved-mem = "1G"
+  # The disk space reserved for the operating system and the agent service.
+  # Currently this value is unused. In future releases, it may be used to preserve
+  # the minimum disk space from the scratch disk allocation via loopback files.
+  reserved-disk = "8G"
+  # The alignment of the reported main memory size to absorb tiny deviations from
+  # per-node firmware/hardware settings. Recommended to be multiple of the
+  # page/hugepage size (e.g., 2 MiB).
+  memory-align-size = "16M"
+  # Resource allocation order
+  allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
+  # Affinity policy
+  affinity-policy = "INTERLEAVED"
 
 # Pyroscope configuration
 [pyroscope]
@@ -215,30 +244,6 @@
   # Override default log level for specific scope of package
   [logging.pkg_ns]
 
-# Resource configuration
-[resource]
-  # The number of CPU cores reserved for the operating system and the agent
-  # service.
-  reserved-cpu = 1
-  # The memory space reserved for the operating system and the agent service. It
-  # is subtracted from the reported main memory size and not available for user
-  # workload allocation. Depending on the memory-align-size option and system
-  # configuration, this may not be the exact value but have slightly less or more
-  # values within the memory-align-size.
-  reserved-mem = "1G"
-  # The disk space reserved for the operating system and the agent service.
-  # Currently this value is unused. In future releases, it may be used to preserve
-  # the minimum disk space from the scratch disk allocation via loopback files.
-  reserved-disk = "8G"
-  # The alignment of the reported main memory size to absorb tiny deviations from
-  # per-node firwmare/hardware settings. Recommended to be multiple of the
-  # page/hugepage size (e.g., 2 MiB).
-  memory-align-size = "16M"
-  # Resource allocation order
-  allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
-  # Affinity policy
-  affinity-policy = "INTERLEAVED"
-
 # OpenTelemetry configuration
 [otel]
   # Whether to enable OpenTelemetry
@@ -326,3 +331,102 @@
   init-polling-timeout-sec = 60.0
   # Init timeout in seconds
   init-timeout-sec = 60.0
+
+# Configuration overrides for multiple agents.
+# Use this field only to define 2 or more agents, as defining only one
+# agent using this field is redundant. Use the fields agent, container,
+# and resource to define the configuration at a global level.
+# Any field populated in the agents config will be treated as an
+# override to the global default values. Thus the global fields must still
+# be provided when defining multiple agents.
+[[agents]]
+# Add multiple [[agents]] sections as needed
+  # Agent settings that can be overridden per-agent in multi-agent mode.
+  [agents.agent]
+    # Agent ID
+    id = "agent-001"
+    # Agent socket port
+    agent-sock-port = 6007
+    # Scaling group name
+    scaling-group = "default"
+    # Scaling group type
+    scaling-group-type = "compute"
+    # Allowed compute plugins
+    ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
+    # Blocked compute plugins
+    ## block-compute-plugins = [ "ai.backend.accelerator.mock",]
+    # Allowed network plugins
+    ## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
+    # Blocked network plugins
+    ## block-network-plugins = [ "ai.backend.manager.network.overlay",]
+    # Whether to force terminate abusing containers
+    force-terminate-abusing-containers = false
+    # Kernel creation concurrency
+    kernel-creation-concurrency = 4
+
+    # Container lifecycle synchronization config
+    [agents.agent.sync-container-lifecycles]
+      # Whether to enable container lifecycle synchronization
+      enabled = true
+      # Synchronization interval in seconds
+      interval = 10.0
+
+  # Container settings that can be overridden per-agent in multi-agent mode.
+  [agents.container]
+    # Kernel user ID
+    kernel-uid = -1
+    # Kernel group ID
+    kernel-gid = -1
+    # Port range for containers.
+    # If multiple agents are used, the user must ensure that the port ranges
+    # do not overlap between agents; otherwise it may cause subtle issues
+    # late in the agents' runtime.
+    port-range = [ 30000, 31000,]
+    # Statistics type
+    ## stats-type = "cgroup"
+    # Sandbox type
+    sandbox-type = "docker"
+    # Jail arguments
+    jail-args = [ "--mount", "/tmp",]
+    # Scratch type
+    scratch-type = "hostdir"
+    # Scratch root directory
+    scratch-root = "scratches"
+    # Scratch size
+    scratch-size = 0
+    # Scratch NFS address
+    ## scratch-nfs-address = "192.168.1.100:/export"
+    # Scratch NFS options
+    ## scratch-nfs-options = "rw,sync"
+    # Alternative bridge network
+    ## alternative-bridge = "br-backend"
+    # Whether to enable Docker Swarm mode.
+    # This allows the agent to manage containers in a Docker Swarm cluster.
+    # When enabled, the agent will use Docker Swarm APIs to manage containers,
+    # networks, and services.
+    # This field is only used when backend is set to 'docker'.
+    swarm-enabled = false
+
+  # Resource config overrides for the individual agent
+  [agents.resource]
+    # The number of CPU cores reserved for the operating system and the agent
+    # service.
+    reserved-cpu = 1
+    # The memory space reserved for the operating system and the agent service. It
+    # is subtracted from the reported main memory size and not available for user
+    # workload allocation. Depending on the memory-align-size option and system
+    # configuration, this may not be the exact value but have slightly less or more
+    # values within the memory-align-size.
+    reserved-mem = "1G"
+    # The disk space reserved for the operating system and the agent service.
+    # Currently this value is unused. In future releases, it may be used to preserve
+    # the minimum disk space from the scratch disk allocation via loopback files.
+    reserved-disk = "8G"
+    # The alignment of the reported main memory size to absorb tiny deviations from
+    # per-node firmware/hardware settings. Recommended to be multiple of the
+    # page/hugepage size (e.g., 2 MiB).
+    memory-align-size = "16M"
+    # Resource allocation order
+    allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
+    # Affinity policy
+    affinity-policy = "INTERLEAVED"
@@ -1517,6 +1517,9 @@ def reset_port_pool(self, used_ports: Iterable[int]) -> None:
         }
         self.port_pool = original_port_pool
 
+    def update_scaling_group(self, scaling_group: str) -> None:
+        self.local_config.update(agent_update={"scaling_group": scaling_group})
+
     async def purge_containers(self, containers: Iterable[ContainerKernelId]) -> None:
         tasks = [self._purge_container(container) for container in containers]
         await asyncio.gather(*tasks, return_exceptions=True)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add support for multiple agents in agent server config`
Original file line number	Diff line number	Diff line change
`@@ -1517,6 +1517,9 @@ def reset_port_pool(self, used_ports: Iterable[int]) -> None:`
`1517`	`1517`	`}`
`1518`	`1518`	`self.port_pool = original_port_pool`
`1519`	`1519`
	`1520`	`+ def update_scaling_group(self, scaling_group: str) -> None:`
	`1521`	`+ self.local_config.update(agent_update={"scaling_group": scaling_group})`
	`1522`	`+`
`1520`	`1523`	`async def purge_containers(self, containers: Iterable[ContainerKernelId]) -> None:`
`1521`	`1524`	`tasks = [self._purge_container(container) for container in containers]`
`1522`	`1525`	`await asyncio.gather(*tasks, return_exceptions=True)`