|
11 | 11 | # It is not intended to be set in the configuration file. |
12 | 12 | ## plugins = "..." |
13 | 13 |
|
14 | | -# Agent configuration |
| 14 | +# Agent configuration. |
| 15 | +# If the agents field is populated, this section provides the default values for |
| 16 | +# all agents. |
15 | 17 | [agent] |
16 | 18 | # Backend type for the agent. |
17 | 19 | # This determines how the agent interacts with the underlying infrastructure. |
|
75 | 77 | # Metadata server port |
76 | 78 | metadata-server-port = 40128 |
77 | 79 | # Allowed compute plugins |
78 | | - ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",] |
| 80 | + ## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",] |
79 | 81 | # Blocked compute plugins |
80 | 82 | ## block-compute-plugins = [ "ai.backend.accelerator.mock",] |
81 | 83 | # Allowed network plugins |
|
104 | 106 | # Synchronization interval in seconds |
105 | 107 | interval = 10.0 |
106 | 108 |
|
107 | | -# Container configuration |
| 109 | +# Container configuration. |
| 110 | +# If the agents field is populated, this section provides the default values for |
| 111 | +# all agents. |
108 | 112 | [container] |
109 | 113 | # Kernel user ID |
110 | 114 | kernel-uid = -1 |
|
114 | 118 | bind-host = "" |
115 | 119 | # Advertised host for containers |
116 | 120 | ## advertised-host = "192.168.1.100" |
117 | | - # Port range for containers |
| 121 | + # Port range for containers. |
| 122 | + # If multiple agents are used, the user must ensure that the port ranges |
| 123 | + # of different agents do not overlap; otherwise, subtle issues may |
| 124 | + # surface late in the agents' runtime. |
118 | 125 | port-range = [ 30000, 31000,] |
119 | 126 | # Statistics type |
120 | 127 | ## stats-type = "docker" |
|
149 | 156 | # It is not intended to be set in the configuration file. |
150 | 157 | ## [container.krunner-volumes] |
151 | 158 |
|
| 159 | +# Resource configuration. |
| 160 | +# If the agents field is populated, this section provides the default values for |
| 161 | +# all agents. |
| 162 | +[resource] |
| 163 | + # The number of CPU cores reserved for the operating system and the agent |
| 164 | + # service. |
| 165 | + reserved-cpu = 1 |
| 166 | + # The memory space reserved for the operating system and the agent service. It |
| 167 | + # is subtracted from the reported main memory size and not available for user |
| 168 | + # workload allocation. Depending on the memory-align-size option and system |
| 169 | + # configuration, the effective reservation may not match this value exactly; it |
| 170 | + # may be slightly smaller or larger, within the memory-align-size. |
| 171 | + reserved-mem = "1G" |
| 172 | + # The disk space reserved for the operating system and the agent service. |
| 173 | + # Currently this value is unused. In future releases, it may be used to preserve |
| 174 | + # the minimum disk space from the scratch disk allocation via loopback files. |
| 175 | + reserved-disk = "8G" |
| 176 | + # The alignment of the reported main memory size to absorb tiny deviations from |
| 177 | + # per-node firmware/hardware settings. Recommended to be a multiple of the |
| 178 | + # page/hugepage size (e.g., 2 MiB). |
| 179 | + memory-align-size = "16M" |
| 180 | + # Resource allocation order |
| 181 | + allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",] |
| 182 | + # Affinity policy |
| 183 | + affinity-policy = "INTERLEAVED" |
| 184 | + |
152 | 185 | # Pyroscope configuration |
153 | 186 | [pyroscope] |
154 | 187 | # Whether to enable Pyroscope profiling |
|
228 | 261 | # Override default log level for specific scope of package |
229 | 262 | [logging.pkg_ns] |
230 | 263 |
|
231 | | -# Resource configuration |
232 | | -[resource] |
233 | | - # The number of CPU cores reserved for the operating system and the agent |
234 | | - # service. |
235 | | - reserved-cpu = 1 |
236 | | - # The memory space reserved for the operating system and the agent service. It |
237 | | - # is subtracted from the reported main memory size and not available for user |
238 | | - # workload allocation. Depending on the memory-align-size option and system |
239 | | - # configuration, this may not be the exact value but have slightly less or more |
240 | | - # values within the memory-align-size. |
241 | | - reserved-mem = "1G" |
242 | | - # The disk space reserved for the operating system and the agent service. |
243 | | - # Currently this value is unused. In future releases, it may be used to preserve |
244 | | - # the minimum disk space from the scratch disk allocation via loopback files. |
245 | | - reserved-disk = "8G" |
246 | | - # The alignment of the reported main memory size to absorb tiny deviations from |
247 | | - # per-node firwmare/hardware settings. Recommended to be multiple of the |
248 | | - # page/hugepage size (e.g., 2 MiB). |
249 | | - memory-align-size = "16M" |
250 | | - # Resource allocation order |
251 | | - allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",] |
252 | | - # Affinity policy |
253 | | - affinity-policy = "INTERLEAVED" |
254 | | - |
255 | 264 | # OpenTelemetry configuration |
256 | 265 | [otel] |
257 | 266 | # Whether to enable OpenTelemetry |
|
344 | 353 | # This field is injected at runtime based on etcd configuration. |
345 | 354 | # It is not intended to be set in the other way. |
346 | 355 | ## [redis] |
| 356 | + |
| 357 | +# Configuration overrides for multiple agents. |
| 358 | +# Use this field only to define 2 or more agents, as defining only one |
| 359 | +# agent using this field is redundant. Use the fields agent, container, |
| 360 | +# and resource to define the configuration at a global level. |
| 361 | +# Any field populated in the agents config will be treated as an |
| 362 | +# override to the global default values. Thus the global fields must still |
| 363 | +# be provided when defining multiple agents. |
| 364 | +[[agents]] |
| 365 | +# Add multiple [[agents]] sections as needed |
| 366 | + # Agent config overrides for the individual agent. |
| 367 | + # All fields except Agent ID are by default optional. |
| 368 | + # Only override fields if necessary. |
| 369 | + [agents.agent] |
| 370 | + # Agent ID |
| 371 | + id = "agent-001" |
| 372 | + # Agent socket port |
| 373 | + ## agent-sock-port = 6007 # min=1024 max=65535 |
| 374 | + # Mount path for containers |
| 375 | + ## mount-path = "/mnt/backend.ai" |
| 376 | + # Whether to enable cohabiting storage proxy |
| 377 | + ## cohabiting-storage-proxy = true |
| 378 | + # Allowed compute plugins |
| 379 | + ## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",] |
| 380 | + # Blocked compute plugins |
| 381 | + ## block-compute-plugins = [ "ai.backend.accelerator.mock",] |
| 382 | + # Allowed network plugins |
| 383 | + ## allow-network-plugins = [ "ai.backend.manager.network.overlay",] |
| 384 | + # Blocked network plugins |
| 385 | + ## block-network-plugins = [ "ai.backend.manager.network.overlay",] |
| 386 | + # Whether to force terminate abusing containers |
| 387 | + ## force-terminate-abusing-containers = true |
| 388 | + # Kernel creation concurrency |
| 389 | + ## kernel-creation-concurrency = 4 # min=1 max=32 |
| 390 | + # Docker mode detected based on kernel version (linuxkit/native) |
| 391 | + ## docker-mode = "linuxkit" |
| 392 | + # Owner uid:gid of the mount directory |
| 393 | + ## mount-path-uid-gid = "root:root" |
| 394 | + |
| 395 | + # Container lifecycle synchronization config |
| 396 | + [agents.agent.sync-container-lifecycles] |
| 397 | + # Whether to enable container lifecycle synchronization |
| 398 | + enabled = true |
| 399 | + # Synchronization interval in seconds |
| 400 | + interval = 10.0 |
| 401 | + |
| 402 | + # Container config overrides for the individual agent |
| 403 | + [agents.container] |
| 404 | + # Kernel user ID |
| 405 | + ## kernel-uid = 1000 |
| 406 | + # Kernel group ID |
| 407 | + ## kernel-gid = 1000 |
| 408 | + # Port range for containers. |
| 409 | + # If multiple agents are used, the user must ensure that the port ranges |
| 410 | + # of different agents do not overlap; otherwise, subtle issues may |
| 411 | + # surface late in the agents' runtime. |
| 412 | + ## port-range = [ 30000, 31000,] |
| 413 | + # Statistics type |
| 414 | + ## stats-type = "cgroup" |
| 415 | + # Sandbox type |
| 416 | + ## sandbox-type = "docker" |
| 417 | + # Jail arguments |
| 418 | + ## jail-args = [ "--mount", "/tmp",] |
| 419 | + # Scratch type |
| 420 | + ## scratch-type = "hostdir" |
| 421 | + # Scratch root directory |
| 422 | + ## scratch-root = "./scratches" |
| 423 | + # Scratch size |
| 424 | + ## scratch-size = "1G" |
| 425 | + # Scratch NFS address |
| 426 | + ## scratch-nfs-address = "192.168.1.100:/export" |
| 427 | + # Scratch NFS options |
| 428 | + ## scratch-nfs-options = "rw,sync" |
| 429 | + # Alternative bridge network |
| 430 | + ## alternative-bridge = "br-backend" |
| 431 | + # Whether to enable Docker Swarm mode. |
| 432 | + # This allows the agent to manage containers in a Docker Swarm cluster. |
| 433 | + # When enabled, the agent will use Docker Swarm APIs to manage containers, |
| 434 | + # networks, and services. |
| 435 | + # This field is only used when backend is set to 'docker'. |
| 436 | + ## swarm-enabled = true |
| 437 | + |
| 438 | + # Resource config overrides for the individual agent |
| 439 | + [agents.resource] |
| 440 | + # The number of CPU cores reserved for the operating system and the agent |
| 441 | + # service. |
| 442 | + reserved-cpu = 1 |
| 443 | + # The memory space reserved for the operating system and the agent service. It |
| 444 | + # is subtracted from the reported main memory size and not available for user |
| 445 | + # workload allocation. Depending on the memory-align-size option and system |
| 446 | + # configuration, the effective reservation may not match this value exactly; it |
| 447 | + # may be slightly smaller or larger, within the memory-align-size. |
| 448 | + reserved-mem = 1073741824 |
| 449 | + # The disk space reserved for the operating system and the agent service. |
| 450 | + # Currently this value is unused. In future releases, it may be used to preserve |
| 451 | + # the minimum disk space from the scratch disk allocation via loopback files. |
| 452 | + reserved-disk = 8589934592 |
| 453 | + # The alignment of the reported main memory size to absorb tiny deviations from |
| 454 | + # per-node firmware/hardware settings. Recommended to be a multiple of the |
| 455 | + # page/hugepage size (e.g., 2 MiB). |
| 456 | + memory-align-size = 16777216 |
| 457 | + # Resource allocation order |
| 458 | + allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",] |
| 459 | + # Affinity policy |
| 460 | + affinity-policy = 1 |
0 commit comments