|
6 | 6 | # |
7 | 7 | # Generated automatically from the AgentUnifiedConfig schema. |
8 | 8 |
|
9 | | -# Agent configuration |
| 9 | +# Complete agent configuration (common + overridable). |
10 | 10 | [agent] |
| 11 | + # Agent ID |
| 12 | + id = "agent-001" |
| 13 | + # Agent socket port |
| 14 | + agent-sock-port = 6007 |
| 15 | + # Scaling group name |
| 16 | + scaling-group = "default" |
| 17 | + # Scaling group type |
| 18 | + scaling-group-type = "compute" |
| 19 | + # Allowed compute plugins |
| 20 | + ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",] |
| 21 | + # Blocked compute plugins |
| 22 | + ## block-compute-plugins = [ "ai.backend.accelerator.mock",] |
| 23 | + # Allowed network plugins |
| 24 | + ## allow-network-plugins = [ "ai.backend.manager.network.overlay",] |
| 25 | + # Blocked network plugins |
| 26 | + ## block-network-plugins = [ "ai.backend.manager.network.overlay",] |
| 27 | + # Whether to force terminate abusing containers |
| 28 | + force-terminate-abusing-containers = false |
| 29 | + # Kernel creation concurrency |
| 30 | + kernel-creation-concurrency = 4 |
11 | 31 | # Backend type for the agent. |
12 | 32 | # This determines how the agent interacts with the underlying infrastructure. |
13 | 33 | # Available options are: |
|
33 | 53 | ## rpc-auth-manager-public-key = "/path/to/public.key" |
34 | 54 | # Path to RPC auth agent keypair |
35 | 55 | ## rpc-auth-agent-keypair = "/path/to/keypair.key" |
36 | | - # Agent socket port |
37 | | - agent-sock-port = 6007 |
38 | | - # Agent ID |
39 | | - ## id = "agent-001" |
40 | 56 | # Base path for IPC |
41 | 57 | ipc-base-path = "/tmp/backend.ai/ipc" |
42 | 58 | # Base path for variable data |
|
51 | 67 | ## region = "us-east-1" |
52 | 68 | # Instance type |
53 | 69 | ## instance-type = "m5.large" |
54 | | - # Scaling group name |
55 | | - scaling-group = "default" |
56 | | - # Scaling group type |
57 | | - scaling-group-type = "compute" |
58 | 70 | # Path to PID file |
59 | 71 | pid-file = "/dev/null" |
60 | 72 | # Event loop type |
|
69 | 81 | metadata-server-bind-host = "0.0.0.0" |
70 | 82 | # Metadata server port |
71 | 83 | metadata-server-port = 40128 |
72 | | - # Allowed compute plugins |
73 | | - ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",] |
74 | | - # Blocked compute plugins |
75 | | - ## block-compute-plugins = [ "ai.backend.accelerator.mock",] |
76 | | - # Allowed network plugins |
77 | | - ## allow-network-plugins = [ "ai.backend.manager.network.overlay",] |
78 | | - # Blocked network plugins |
79 | | - ## block-network-plugins = [ "ai.backend.manager.network.overlay",] |
80 | 84 | # Path for image commit |
81 | 85 | image-commit-path = "tmp/backend.ai/commit" |
82 | 86 | # Path for abuse reports |
83 | 87 | ## abuse-report-path = "/var/log/backend.ai/abuse" |
84 | | - # Whether to force terminate abusing containers |
85 | | - force-terminate-abusing-containers = false |
86 | | - # Kernel creation concurrency |
87 | | - kernel-creation-concurrency = 4 |
88 | 88 | # Whether to use experimental Redis event dispatcher |
89 | 89 | use-experimental-redis-event-dispatcher = false |
90 | 90 | # Docker mode detected based on kernel version (linuxkit/native) |
|
99 | 99 | # Synchronization interval in seconds |
100 | 100 | interval = 10.0 |
101 | 101 |
|
102 | | -# Container configuration |
| 102 | +# Complete container configuration (common + overridable). |
103 | 103 | [container] |
104 | 104 | # Kernel user ID |
105 | 105 | kernel-uid = -1 |
106 | 106 | # Kernel group ID |
107 | 107 | kernel-gid = -1 |
108 | | - # Bind host for containers |
109 | | - bind-host = "" |
110 | | - # Advertised host for containers |
111 | | - ## advertised-host = "192.168.1.100" |
112 | | - # Port range for containers |
| 108 | + # Port range for containers. |
| 109 | + # If multiple agents are used, the user must ensure that the port ranges |
| 110 | + # do not overlap between agents; otherwise it may cause subtle issues |
| 111 | + # late in the agents' runtime. |
113 | 112 | port-range = [ 30000, 31000,] |
114 | 113 | # Statistics type |
115 | 114 | ## stats-type = "docker" |
|
135 | 134 | # networks, and services. |
136 | 135 | # This field is only used when backend is set to 'docker'. |
137 | 136 | swarm-enabled = false |
| 137 | + # Bind host for containers |
| 138 | + bind-host = "" |
| 139 | + # Advertised host for containers |
| 140 | + ## advertised-host = "192.168.1.100" |
| 141 | + |
| 142 | +# Resource configuration. |
| 143 | +# If the agents field is populated, the values here serve as the defaults for |
| 144 | +# all agents. |
| 145 | +[resource] |
| 146 | + # The number of CPU cores reserved for the operating system and the agent |
| 147 | + # service. |
| 148 | + reserved-cpu = 1 |
| 149 | + # The memory space reserved for the operating system and the agent service. It |
| 150 | + # is subtracted from the reported main memory size and not available for user |
| 151 | + # workload allocation. Depending on the memory-align-size option and system |
| 152 | + # configuration, the effective value may not match exactly and can be slightly |
| 153 | + # smaller or larger, within the memory-align-size. |
| 154 | + reserved-mem = "1G" |
| 155 | + # The disk space reserved for the operating system and the agent service. |
| 156 | + # Currently this value is unused. In future releases, it may be used to preserve |
| 157 | + # the minimum disk space from the scratch disk allocation via loopback files. |
| 158 | + reserved-disk = "8G" |
| 159 | + # The alignment of the reported main memory size to absorb tiny deviations from |
| 160 | + # per-node firmware/hardware settings. Recommended to be a multiple of the |
| 161 | + # page/hugepage size (e.g., 2 MiB). |
| 162 | + memory-align-size = "16M" |
| 163 | + # Resource allocation order |
| 164 | + allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",] |
| 165 | + # Affinity policy |
| 166 | + affinity-policy = "INTERLEAVED" |
138 | 167 |
|
139 | 168 | # Pyroscope configuration |
140 | 169 | [pyroscope] |
|
215 | 244 | # Override default log level for specific scope of package |
216 | 245 | [logging.pkg_ns] |
217 | 246 |
|
218 | | -# Resource configuration |
219 | | -[resource] |
220 | | - # The number of CPU cores reserved for the operating system and the agent |
221 | | - # service. |
222 | | - reserved-cpu = 1 |
223 | | - # The memory space reserved for the operating system and the agent service. It |
224 | | - # is subtracted from the reported main memory size and not available for user |
225 | | - # workload allocation. Depending on the memory-align-size option and system |
226 | | - # configuration, this may not be the exact value but have slightly less or more |
227 | | - # values within the memory-align-size. |
228 | | - reserved-mem = "1G" |
229 | | - # The disk space reserved for the operating system and the agent service. |
230 | | - # Currently this value is unused. In future releases, it may be used to preserve |
231 | | - # the minimum disk space from the scratch disk allocation via loopback files. |
232 | | - reserved-disk = "8G" |
233 | | - # The alignment of the reported main memory size to absorb tiny deviations from |
234 | | - # per-node firwmare/hardware settings. Recommended to be multiple of the |
235 | | - # page/hugepage size (e.g., 2 MiB). |
236 | | - memory-align-size = "16M" |
237 | | - # Resource allocation order |
238 | | - allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",] |
239 | | - # Affinity policy |
240 | | - affinity-policy = "INTERLEAVED" |
241 | | - |
242 | 247 | # OpenTelemetry configuration |
243 | 248 | [otel] |
244 | 249 | # Whether to enable OpenTelemetry |
|
326 | 331 | init-polling-timeout-sec = 60.0 |
327 | 332 | # Init timeout in seconds |
328 | 333 | init-timeout-sec = 60.0 |
| 334 | + |
| 335 | +# Configuration overrides for multiple agents. |
| 336 | +# Use this field only to define 2 or more agents, as defining only one |
| 337 | +# agent using this field is redundant. Use the fields agent, container, |
| 338 | +# and resource to define the configuration at a global level. |
| 339 | +# Any field populated in the agents config will be treated as an |
| 340 | +# override to the global default values. Thus the global fields must still |
| 341 | +# be provided when defining multiple agents. |
| 342 | +[[agents]] |
| 343 | +# Add multiple [[agents]] sections as needed |
| 344 | + # Agent settings that can be overridden per-agent in multi-agent mode. |
| 345 | + [agents.agent] |
| 346 | + # Agent ID |
| 347 | + id = "agent-001" |
| 348 | + # Agent socket port |
| 349 | + agent-sock-port = 6007 |
| 350 | + # Scaling group name |
| 351 | + scaling-group = "default" |
| 352 | + # Scaling group type |
| 353 | + scaling-group-type = "compute" |
| 354 | + # Allowed compute plugins |
| 355 | + ## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",] |
| 356 | + # Blocked compute plugins |
| 357 | + ## block-compute-plugins = [ "ai.backend.accelerator.mock",] |
| 358 | + # Allowed network plugins |
| 359 | + ## allow-network-plugins = [ "ai.backend.manager.network.overlay",] |
| 360 | + # Blocked network plugins |
| 361 | + ## block-network-plugins = [ "ai.backend.manager.network.overlay",] |
| 362 | + # Whether to force terminate abusing containers |
| 363 | + force-terminate-abusing-containers = false |
| 364 | + # Kernel creation concurrency |
| 365 | + kernel-creation-concurrency = 4 |
| 366 | + |
| 367 | + # Container lifecycle synchronization config |
| 368 | + [agents.agent.sync-container-lifecycles] |
| 369 | + # Whether to enable container lifecycle synchronization |
| 370 | + enabled = true |
| 371 | + # Synchronization interval in seconds |
| 372 | + interval = 10.0 |
| 373 | + |
| 374 | + # Container settings that can be overridden per-agent in multi-agent mode. |
| 375 | + [agents.container] |
| 376 | + # Kernel user ID |
| 377 | + kernel-uid = -1 |
| 378 | + # Kernel group ID |
| 379 | + kernel-gid = -1 |
| 380 | + # Port range for containers. |
| 381 | + # If multiple agents are used, the user must ensure that the port ranges |
| 382 | + # do not overlap between agents; otherwise it may cause subtle issues |
| 383 | + # late in the agents' runtime. |
| 384 | + port-range = [ 30000, 31000,] |
| 385 | + # Statistics type |
| 386 | + ## stats-type = "cgroup" |
| 387 | + # Sandbox type |
| 388 | + sandbox-type = "docker" |
| 389 | + # Jail arguments |
| 390 | + jail-args = [ "--mount", "/tmp",] |
| 391 | + # Scratch type |
| 392 | + scratch-type = "hostdir" |
| 393 | + # Scratch root directory |
| 394 | + scratch-root = "scratches" |
| 395 | + # Scratch size |
| 396 | + scratch-size = 0 |
| 397 | + # Scratch NFS address |
| 398 | + ## scratch-nfs-address = "192.168.1.100:/export" |
| 399 | + # Scratch NFS options |
| 400 | + ## scratch-nfs-options = "rw,sync" |
| 401 | + # Alternative bridge network |
| 402 | + ## alternative-bridge = "br-backend" |
| 403 | + # Whether to enable Docker Swarm mode. |
| 404 | + # This allows the agent to manage containers in a Docker Swarm cluster. |
| 405 | + # When enabled, the agent will use Docker Swarm APIs to manage containers, |
| 406 | + # networks, and services. |
| 407 | + # This field is only used when backend is set to 'docker'. |
| 408 | + swarm-enabled = false |
| 409 | + |
| 410 | + # Resource config overrides for the individual agent |
| 411 | + [agents.resource] |
| 412 | + # The number of CPU cores reserved for the operating system and the agent |
| 413 | + # service. |
| 414 | + reserved-cpu = 1 |
| 415 | + # The memory space reserved for the operating system and the agent service. It |
| 416 | + # is subtracted from the reported main memory size and not available for user |
| 417 | + # workload allocation. Depending on the memory-align-size option and system |
| 418 | + # configuration, the effective value may not match exactly and can be slightly |
| 419 | + # smaller or larger, within the memory-align-size. |
| 420 | + reserved-mem = 1073741824 |
| 421 | + # The disk space reserved for the operating system and the agent service. |
| 422 | + # Currently this value is unused. In future releases, it may be used to preserve |
| 423 | + # the minimum disk space from the scratch disk allocation via loopback files. |
| 424 | + reserved-disk = 8589934592 |
| 425 | + # The alignment of the reported main memory size to absorb tiny deviations from |
| 426 | + # per-node firmware/hardware settings. Recommended to be a multiple of the |
| 427 | + # page/hugepage size (e.g., 2 MiB). |
| 428 | + memory-align-size = 16777216 |
| 429 | + # Resource allocation order |
| 430 | + allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",] |
| 431 | + # Affinity policy |
| 432 | + affinity-policy = 1 |
0 commit comments