Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/6320.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Update Agent server RPC functions to include the agent ID for agent runtimes with multiple agents
1 change: 1 addition & 0 deletions changes/6724.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add custom resource allocation in agent server config
83 changes: 44 additions & 39 deletions configs/agent/sample.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,6 @@
scaling-group = "default"
# Scaling group type
scaling-group-type = "compute"
# Allowed compute plugins
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
# Blocked compute plugins
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
# Allowed network plugins
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
# Blocked network plugins
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
# Whether to force terminate abusing containers
force-terminate-abusing-containers = false
# Kernel creation concurrency
Expand Down Expand Up @@ -81,6 +73,14 @@
metadata-server-bind-host = "0.0.0.0"
# Metadata server port
metadata-server-port = 40128
# Allowed compute plugins
## allow-compute-plugins = [ "ai.backend.activator.agent", "ai.backend.accelerator.cuda_open",]
# Blocked compute plugins
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
# Allowed network plugins
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
# Blocked network plugins
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
# Path for image commit
image-commit-path = "tmp/backend.ai/commit"
# Path for abuse reports
Expand Down Expand Up @@ -156,6 +156,12 @@
# Currently this value is unused. In future releases, it may be used to preserve
# the minimum disk space from the scratch disk allocation via loopback files.
reserved-disk = "8G"
# Resource allocation mode for multi-agent scenarios.
# - `shared`: All agents share the full resource pool (default, backward
# compatible).
# - `auto-split`: Automatically divide resources equally (1/N) among all agents.
# - `manual`: Manually specify per-agent resource allocations via config.
allocation-mode = "shared"
# The alignment of the reported main memory size to absorb tiny deviations from
# per-node firmware/hardware settings. Recommended to be a multiple of the
# page/hugepage size (e.g., 2 MiB).
Expand All @@ -165,6 +171,22 @@
# Affinity policy
affinity-policy = "INTERLEAVED"

# Resource allocations.
# Only used when allocation-mode is `manual`.
[resource.allocations]
# Hard CPU allocation for this agent (e.g., 8 cores).
# Only used when allocation-mode is `manual`.
# All agents must specify this value when allocation-mode is `manual`.
cpu = 8
# Hard memory allocation for this agent (e.g., "32G").
# Only used when allocation-mode is `manual`.
# All agents must specify this value when allocation-mode is `manual`.
mem = "32G"

# Device-specific per-slot resource allocations.
# Only used when allocation-mode is `manual`.
[resource.allocations.devices]

# Pyroscope configuration
[pyroscope]
# Whether to enable Pyroscope profiling
Expand Down Expand Up @@ -351,14 +373,6 @@
scaling-group = "default"
# Scaling group type
scaling-group-type = "compute"
# Allowed compute plugins
## allow-compute-plugins = [ "ai.backend.accelerator.cuda_open", "ai.backend.activator.agent",]
# Blocked compute plugins
## block-compute-plugins = [ "ai.backend.accelerator.mock",]
# Allowed network plugins
## allow-network-plugins = [ "ai.backend.manager.network.overlay",]
# Blocked network plugins
## block-network-plugins = [ "ai.backend.manager.network.overlay",]
# Whether to force terminate abusing containers
force-terminate-abusing-containers = false
# Kernel creation concurrency
Expand All @@ -383,7 +397,7 @@
# late into the agent's runtime.
port-range = [ 30000, 31000,]
# Statistics type
## stats-type = "cgroup"
## stats-type = "docker"
# Sandbox type
sandbox-type = "docker"
# Jail arguments
Expand All @@ -393,7 +407,7 @@
# Scratch root directory
scratch-root = "scratches"
# Scratch size
scratch-size = 0
scratch-size = "0"
# Scratch NFS address
## scratch-nfs-address = "192.168.1.100:/export"
# Scratch NFS options
Expand All @@ -409,24 +423,15 @@

# Resource config overrides for the individual agent
[agents.resource]
# The number of CPU cores reserved for the operating system and the agent
# service.
reserved-cpu = 1
# The memory space reserved for the operating system and the agent service. It
# is subtracted from the reported main memory size and not available for user
# workload allocation. Depending on the memory-align-size option and system
# configuration, the effective value may not match exactly and can be slightly
# lower or higher, within the memory-align-size.
reserved-mem = 1073741824
# The disk space reserved for the operating system and the agent service.
# Currently this value is unused. In future releases, it may be used to preserve
# the minimum disk space from the scratch disk allocation via loopback files.
reserved-disk = 8589934592
# The alignment of the reported main memory size to absorb tiny deviations from
# per-node firmware/hardware settings. Recommended to be a multiple of the
# page/hugepage size (e.g., 2 MiB).
memory-align-size = 16777216
# Resource allocation order
allocation-order = [ "cuda", "rocm", "tpu", "cpu", "mem",]
# Affinity policy
affinity-policy = 1
# Hard CPU allocation for this agent (e.g., 8 cores).
# Only used when allocation-mode is `manual`.
# All agents must specify this value when allocation-mode is `manual`.
cpu = 8
# Hard memory allocation for this agent (e.g., "32G").
# Only used when allocation-mode is `manual`.
# All agents must specify this value when allocation-mode is `manual`.
mem = "32G"

# Device-specific per-slot resource allocations.
# Only used when allocation-mode is `manual`.
[agents.resource.devices]
4 changes: 2 additions & 2 deletions src/ai/backend/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2276,7 +2276,7 @@ async def scan_running_kernels(self) -> None:
"""
ipc_base_path = self.local_config.agent.ipc_base_path
var_base_path = self.local_config.agent.var_base_path
last_registry_file = f"last_registry.{self.local_instance_id}.dat"
last_registry_file = f"last_registry.{self.id}.dat"
if os.path.isfile(ipc_base_path / last_registry_file):
shutil.move(ipc_base_path / last_registry_file, var_base_path / last_registry_file)
try:
Expand Down Expand Up @@ -3745,7 +3745,7 @@ async def save_last_registry(self, force=False) -> None:
if (not force) and (now <= self.last_registry_written_time + 60):
return # don't save too frequently
var_base_path = self.local_config.agent.var_base_path
last_registry_file = f"last_registry.{self.local_instance_id}.dat"
last_registry_file = f"last_registry.{self.id}.dat"
try:
with open(var_base_path / last_registry_file, "wb") as f:
pickle.dump(self.kernel_registry, f)
Expand Down
Loading
Loading