lablup
diff --git a/‎changes/6315.feature.md‎
Lines changed: 1 addition & 0 deletions b/‎changes/6315.feature.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎configs/agent/sample.toml‎
Lines changed: 177 additions & 59 deletions b/‎configs/agent/sample.toml‎
Lines changed: 177 additions & 59 deletions
diff --git a/‎src/ai/backend/agent/agent.py‎
Lines changed: 5 additions & 0 deletions b/‎src/ai/backend/agent/agent.py‎
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1 @@
+Add support for multiple agents in agent server config
@@ -11,8 +11,38 @@
 # It is not intended to be set in the configuration file.
 ## plugins = "..."
 
-# Agent configuration
+# Agent configuration.
+# If the agents field is populated, the values in this section serve as the
+# defaults for all agents.
 [agent]
+  # Agent ID
+  ## id = "agent-001"
+  # Agent socket port
+  agent-sock-port = 6007
+  # Base path for IPC
+  ipc-base-path = "/tmp/backend.ai/ipc"
+  # Base path for variable data
+  var-base-path = "var/lib/backend.ai"
+  # Scaling group name
+  scaling-group = "default"
+  # Scaling group type
+  scaling-group-type = "compute"
+  # Allowed compute plugins
+  ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
+  # Blocked compute plugins
+  ## block-compute-plugins = [ "ai.backend.accelerator.mock",]
+  # Allowed network plugins
+  ## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
+  # Blocked network plugins
+  ## block-network-plugins = [ "ai.backend.manager.network.overlay",]
+  # Path for image commit
+  image-commit-path = "tmp/backend.ai/commit"
+  # Path for abuse reports
+  ## abuse-report-path = "/var/log/backend.ai/abuse"
+  # Whether to force terminate abusing containers
+  force-terminate-abusing-containers = false
+  # Kernel creation concurrency
+  kernel-creation-concurrency = 4
   # Backend type for the agent.
   # This determines how the agent interacts with the underlying infrastructure.
   # Available options are:
@@ -38,14 +68,6 @@
   ## rpc-auth-manager-public-key = "/path/to/public.key"
   # Path to RPC auth agent keypair
   ## rpc-auth-agent-keypair = "/path/to/keypair.key"
-  # Agent socket port
-  agent-sock-port = 6007
-  # Agent ID
-  ## id = "agent-001"
-  # Base path for IPC
-  ipc-base-path = "/tmp/backend.ai/ipc"
-  # Base path for variable data
-  var-base-path = "var/lib/backend.ai"
   # Mount path for containers
   ## mount-path = "/mnt/backend.ai"
   # Whether to enable cohabiting storage proxy
@@ -56,10 +78,6 @@
   ## region = "us-east-1"
   # Instance type
   ## instance-type = "m5.large"
-  # Scaling group name
-  scaling-group = "default"
-  # Scaling group type
-  scaling-group-type = "compute"
   # Path to PID file
   pid-file = "/dev/null"
   # Event loop type
@@ -74,22 +92,6 @@
   metadata-server-bind-host = "0.0.0.0"
   # Metadata server port
   metadata-server-port = 40128
-  # Allowed compute plugins
-  ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
-  # Blocked compute plugins
-  ## block-compute-plugins = [ "ai.backend.accelerator.mock",]
-  # Allowed network plugins
-  ## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
-  # Blocked network plugins
-  ## block-network-plugins = [ "ai.backend.manager.network.overlay",]
-  # Path for image commit
-  image-commit-path = "tmp/backend.ai/commit"
-  # Path for abuse reports
-  ## abuse-report-path = "/var/log/backend.ai/abuse"
-  # Whether to force terminate abusing containers
-  force-terminate-abusing-containers = false
-  # Kernel creation concurrency
-  kernel-creation-concurrency = 4
   # Whether to use experimental Redis event dispatcher
   use-experimental-redis-event-dispatcher = false
   # Docker mode detected based on kernel version (linuxkit/native)
@@ -104,17 +106,18 @@
     # Synchronization interval in seconds
     interval = 10.0
 
-# Container configuration
+# Container configuration.
+# If the agents field is populated, the values in this section serve as the
+# defaults for all agents.
 [container]
   # Kernel user ID
   kernel-uid = -1
   # Kernel group ID
   kernel-gid = -1
-  # Bind host for containers
-  bind-host = ""
-  # Advertised host for containers
-  ## advertised-host = "192.168.1.100"
-  # Port range for containers
+  # Port range for containers.
+  # If multiple agents are used, the user must ensure that the port ranges
+  # do not overlap between agents; otherwise, it may cause subtle issues
+  # late into the agents' runtime.
   port-range = [ 30000, 31000,]
   # Statistics type
   ## stats-type = "docker"
@@ -140,6 +143,10 @@
   # networks, and services.
   # This field is only used when backend is set to 'docker'.
   swarm-enabled = false
+  # Bind host for containers
+  bind-host = ""
+  # Advertised host for containers
+  ## advertised-host = "192.168.1.100"
 
   # KRunner volumes configuration, mapping container names to host paths.
   # This is used to specify volumes that should be mounted into containers
@@ -149,6 +156,32 @@
   # It is not intended to be set in the configuration file.
   ## [container.krunner-volumes]
 
+# Resource configuration.
+# If the agents field is populated, the values in this section serve as the
+# defaults for all agents.
+[resource]
+  # The number of CPU cores reserved for the operating system and the agent
+  # service.
+  reserved-cpu = 1
+  # The memory space reserved for the operating system and the agent service. It
+  # is subtracted from the reported main memory size and not available for user
+  # workload allocation. Depending on the memory-align-size option and system
+  # configuration, this may not be the exact value but have slightly less or more
+  # values within the memory-align-size.
+  reserved-mem = "1G"
+  # The disk space reserved for the operating system and the agent service.
+  # Currently this value is unused. In future releases, it may be used to preserve
+  # the minimum disk space from the scratch disk allocation via loopback files.
+  reserved-disk = "8G"
+  # The alignment of the reported main memory size to absorb tiny deviations from
+  # per-node firmware/hardware settings. Recommended to be multiple of the
+  # page/hugepage size (e.g., 2 MiB).
+  memory-align-size = "16M"
+  # Resource allocation order
+  allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
+  # Affinity policy
+  affinity-policy = "INTERLEAVED"
+
 # Pyroscope configuration
 [pyroscope]
   # Whether to enable Pyroscope profiling
@@ -228,30 +261,6 @@
   # Override default log level for specific scope of package
   [logging.pkg_ns]
 
-# Resource configuration
-[resource]
-  # The number of CPU cores reserved for the operating system and the agent
-  # service.
-  reserved-cpu = 1
-  # The memory space reserved for the operating system and the agent service. It
-  # is subtracted from the reported main memory size and not available for user
-  # workload allocation. Depending on the memory-align-size option and system
-  # configuration, this may not be the exact value but have slightly less or more
-  # values within the memory-align-size.
-  reserved-mem = "1G"
-  # The disk space reserved for the operating system and the agent service.
-  # Currently this value is unused. In future releases, it may be used to preserve
-  # the minimum disk space from the scratch disk allocation via loopback files.
-  reserved-disk = "8G"
-  # The alignment of the reported main memory size to absorb tiny deviations from
-  # per-node firwmare/hardware settings. Recommended to be multiple of the
-  # page/hugepage size (e.g., 2 MiB).
-  memory-align-size = "16M"
-  # Resource allocation order
-  allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
-  # Affinity policy
-  affinity-policy = "INTERLEAVED"
-
 # OpenTelemetry configuration
 [otel]
   # Whether to enable OpenTelemetry
@@ -344,3 +353,112 @@
 # This field is injected at runtime based on etcd configuration.
 # It is not intended to be set in the other way.
 ## [redis]
+
+# Configuration overrides for multiple agents.
+# Use this field only to define 2 or more agents, as defining only one
+# agent using this field is redundant. Use the fields agent, container,
+# and resource to define the configuration at a global level.
+# Any field populated in the agents config will be treated as an
+# override to the global default values. Thus the global fields must still
+# be provided when defining multiple agents.
+[[agents]]
+# Add multiple [[agents]] sections as needed
+  # Agent config overrides for the individual agent.
+  # All fields except the agent ID are optional by default.
+  # Only override fields if necessary.
+  [agents.agent]
+    # Agent ID
+    ## id = "agent-001"
+    # Agent socket port
+    agent-sock-port = 6007
+    # Base path for IPC
+    ipc-base-path = "/tmp/backend.ai/ipc"
+    # Base path for variable data
+    var-base-path = "var/lib/backend.ai"
+    # Scaling group name
+    scaling-group = "default"
+    # Scaling group type
+    scaling-group-type = "compute"
+    # Allowed compute plugins
+    ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
+    # Blocked compute plugins
+    ## block-compute-plugins = [ "ai.backend.accelerator.mock",]
+    # Allowed network plugins
+    ## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
+    # Blocked network plugins
+    ## block-network-plugins = [ "ai.backend.manager.network.overlay",]
+    # Path for image commit
+    image-commit-path = "tmp/backend.ai/commit"
+    # Path for abuse reports
+    ## abuse-report-path = "/var/log/backend.ai/abuse"
+    # Whether to force terminate abusing containers
+    force-terminate-abusing-containers = false
+    # Kernel creation concurrency
+    kernel-creation-concurrency = 4
+
+    # Container lifecycle synchronization config
+    [agents.agent.sync-container-lifecycles]
+      # Whether to enable container lifecycle synchronization
+      enabled = true
+      # Synchronization interval in seconds
+      interval = 10.0
+
+  # Container config overrides for the individual agent
+  [agents.container]
+    # Kernel user ID
+    kernel-uid = -1
+    # Kernel group ID
+    kernel-gid = -1
+    # Port range for containers.
+    # If multiple agents are used, the user must ensure that the port ranges
+    # do not overlap between agents; otherwise, it may cause subtle issues
+    # late into the agents' runtime.
+    port-range = [ 30000, 31000,]
+    # Statistics type
+    ## stats-type = "cgroup"
+    # Sandbox type
+    sandbox-type = "docker"
+    # Jail arguments
+    jail-args = [ "--mount", "/tmp",]
+    # Scratch type
+    scratch-type = "hostdir"
+    # Scratch root directory
+    scratch-root = "scratches"
+    # Scratch size
+    scratch-size = 0
+    # Scratch NFS address
+    ## scratch-nfs-address = "192.168.1.100:/export"
+    # Scratch NFS options
+    ## scratch-nfs-options = "rw,sync"
+    # Alternative bridge network
+    ## alternative-bridge = "br-backend"
+    # Whether to enable Docker Swarm mode.
+    # This allows the agent to manage containers in a Docker Swarm cluster.
+    # When enabled, the agent will use Docker Swarm APIs to manage containers,
+    # networks, and services.
+    # This field is only used when backend is set to 'docker'.
+    swarm-enabled = false
+
+  # Resource config overrides for the individual agent
+  [agents.resource]
+    # The number of CPU cores reserved for the operating system and the agent
+    # service.
+    reserved-cpu = 1
+    # The memory space reserved for the operating system and the agent service. It
+    # is subtracted from the reported main memory size and not available for user
+    # workload allocation. Depending on the memory-align-size option and system
+    # configuration, this may not be the exact value but have slightly less or more
+    # values within the memory-align-size.
+    reserved-mem = 1073741824
+    # The disk space reserved for the operating system and the agent service.
+    # Currently this value is unused. In future releases, it may be used to preserve
+    # the minimum disk space from the scratch disk allocation via loopback files.
+    reserved-disk = 8589934592
+    # The alignment of the reported main memory size to absorb tiny deviations from
+    # per-node firmware/hardware settings. Recommended to be multiple of the
+    # page/hugepage size (e.g., 2 MiB).
+    memory-align-size = 16777216
+    # Resource allocation order
+    allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
+    # Affinity policy
+    affinity-policy = 1
@@ -1517,6 +1517,11 @@ def reset_port_pool(self, used_ports: Iterable[int]) -> None:
         }
         self.port_pool = original_port_pool
 
+    def update_scaling_group(self, scaling_group: str) -> None:
+        """Set the scaling group recorded in this agent's local config.
+
+        Rebinds ``self.local_config`` to the value returned by
+        ``self.local_config.with_updates(...)``, passing the new name under
+        the ``"scaling_group"`` key of ``agent_update``.
+
+        NOTE(review): presumably ``with_updates`` returns a modified copy
+        rather than mutating in place -- confirm against its definition.
+
+        Args:
+            scaling_group: Name of the scaling group to store.
+        """
+        self.local_config = self.local_config.with_updates(
+            agent_update={"scaling_group": scaling_group}
+        )
+
     async def purge_containers(self, containers: Iterable[ContainerKernelId]) -> None:
         tasks = [self._purge_container(container) for container in containers]
         await asyncio.gather(*tasks, return_exceptions=True)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Add support for multiple agents in agent server config`