Skip to content

Commit 2fcbf29

Browse files
authored
Cap BLAS threads to prevent fork-vs-OpenBLAS atfork freeze (#111)
glibc fork() runs OpenBLAS's pthread_atfork prepare handler, which joins the entire BLAS worker pool. uvloop spawns agent CLIs via fork() on the event loop while the memU thread keeps the pool busy with vector-index mat-vecs — under recall load the pool never quiesces and the loop freezes inside fork() for minutes. Default OPENBLAS_NUM_THREADS=1 (no pool, no atfork wait; our mat-vecs are bandwidth-bound anyway) and skip the SDK's per-connect `claude -v` subprocess. Explicit env always wins.
1 parent bb4b172 commit 2fcbf29

4 files changed

Lines changed: 129 additions & 0 deletions

File tree

nerve/_env.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""Process-wide environment defaults — import before numpy loads.
2+
3+
This module is imported for its side effects at the top of Nerve's entry
4+
points (``nerve.cli``) and before the first ``import numpy`` in the
5+
codebase (``nerve.memory.memu_bridge``). BLAS thread-pool sizing is read
6+
once when the BLAS shared library is loaded, so these defaults are only
7+
effective if they are in the environment *before* numpy is imported
8+
anywhere in the process.
9+
10+
Why cap BLAS at a single thread
11+
===============================
12+
13+
OpenBLAS (bundled with numpy wheels) spawns a worker thread pool sized to
14+
the machine's core count on first parallel operation. On many-core hosts
15+
this interacts catastrophically with subprocess spawning:
16+
17+
1. **fork vs. BLAS atfork collision.** glibc ``fork()`` runs registered
18+
``pthread_atfork`` prepare handlers. OpenBLAS registers one
19+
(``blas_thread_shutdown_``) that ``pthread_join``\\ s its *entire*
20+
worker pool before allowing the fork to proceed. The event loop
21+
spawns agent CLI subprocesses via ``fork()`` (libuv/uvloop), while the
22+
dedicated memU thread keeps the BLAS pool busy with vector-index
23+
mat-vecs over a multi-hundred-MB embedding matrix. Under a recall
24+
storm the pool never quiesces, so a spawn on the loop thread can block
25+
for minutes inside ``fork()`` — freezing the entire server (HTTP,
26+
WebSocket, all sessions). Observed in production on a many-core
27+
deployment; diagnosed via native stack: ``uv__spawn_and_init_child_fork
28+
→ __libc_fork → __run_prefork_handlers → blas_thread_shutdown_ →
29+
pthread_join``.
30+
31+
2. **Multi-threaded BLAS buys us nothing.** Nerve's only BLAS workload
32+
is one mat-vec per memory recall/dedup — memory-bandwidth-bound, ~tens
33+
of ms single-threaded even on large indexes.
34+
35+
With ``OPENBLAS_NUM_THREADS=1`` the pool is never created, the atfork
36+
handler returns immediately, and forks never stall.
37+
38+
``os.environ.setdefault`` is used throughout: an explicitly configured
39+
environment always wins over these defaults.
40+
"""
41+
42+
from __future__ import annotations
43+
44+
import os
45+
46+
47+
def apply_env_defaults() -> None:
    """Apply Nerve's process-environment defaults.

    Every value is written with ``os.environ.setdefault``: a variable that
    is already present in the environment -- even as an empty string -- is
    left untouched, so explicit configuration always wins and calling this
    function more than once changes nothing.
    """
    # Cap the BLAS worker pool (see module docstring). Must be set
    # before numpy/OpenBLAS loads.
    os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

    # The Claude Agent SDK spawns an extra ``claude -v`` subprocess on
    # every connect just to warn about outdated CLIs. With many
    # concurrent sessions this doubles fork pressure on the event loop
    # for a warning nothing acts on. The SDK honors this variable to
    # skip the check; export it as an empty string to re-enable.
    os.environ.setdefault("CLAUDE_AGENT_SDK_SKIP_VERSION_CHECK", "1")


# Run at import time so that a bare ``import nerve._env`` is all an entry
# point needs to get the defaults in place.
apply_env_defaults()

nerve/cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414

1515
from __future__ import annotations
1616

17+
# Must be first: applies BLAS thread caps and other process-env defaults
18+
# that have to be in place before numpy (or any BLAS user) is imported.
19+
import nerve._env # noqa: F401 isort: skip
20+
1721
import asyncio
1822
import logging
1923
import os

nerve/memory/memu_bridge.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@
2020
from typing import Any, Coroutine
2121
from zoneinfo import ZoneInfo
2222

23+
# Ensure BLAS thread caps are applied before numpy loads OpenBLAS — an
24+
# unbounded BLAS pool makes glibc fork() (subprocess spawning on the event
25+
# loop) block in OpenBLAS's atfork handler while the memU thread runs
26+
# vector searches. See nerve/_env.py for the full mechanism.
27+
import nerve._env # noqa: F401 isort: skip
28+
2329
import numpy as np
2430

2531
from nerve.config import NerveConfig

tests/test_env_defaults.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"""Tests for nerve._env — process-env defaults applied before numpy loads.
2+
3+
These defaults prevent the fork-vs-OpenBLAS-atfork collision: an unbounded
4+
BLAS worker pool makes glibc fork() (used by uvloop to spawn agent CLIs on
5+
the event loop) block in OpenBLAS's pthread_atfork prepare handler while
6+
the memU thread runs vector searches.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import importlib
12+
import os
13+
14+
import nerve._env as env_mod
15+
16+
17+
class TestEnvDefaults:
18+
def test_defaults_applied_when_unset(self, monkeypatch):
19+
monkeypatch.delenv("OPENBLAS_NUM_THREADS", raising=False)
20+
monkeypatch.delenv("CLAUDE_AGENT_SDK_SKIP_VERSION_CHECK", raising=False)
21+
22+
env_mod.apply_env_defaults()
23+
24+
assert os.environ["OPENBLAS_NUM_THREADS"] == "1"
25+
assert os.environ["CLAUDE_AGENT_SDK_SKIP_VERSION_CHECK"] == "1"
26+
27+
def test_explicit_values_win(self, monkeypatch):
28+
monkeypatch.setenv("OPENBLAS_NUM_THREADS", "8")
29+
# Empty string re-enables the SDK version check (falsy in the SDK).
30+
monkeypatch.setenv("CLAUDE_AGENT_SDK_SKIP_VERSION_CHECK", "")
31+
32+
env_mod.apply_env_defaults()
33+
34+
assert os.environ["OPENBLAS_NUM_THREADS"] == "8"
35+
assert os.environ["CLAUDE_AGENT_SDK_SKIP_VERSION_CHECK"] == ""
36+
37+
def test_applied_on_import(self, monkeypatch):
38+
"""Importing the module applies the defaults (entry-point contract)."""
39+
monkeypatch.delenv("OPENBLAS_NUM_THREADS", raising=False)
40+
importlib.reload(env_mod)
41+
assert os.environ["OPENBLAS_NUM_THREADS"] == "1"
42+
43+
def test_idempotent(self, monkeypatch):
44+
monkeypatch.delenv("OPENBLAS_NUM_THREADS", raising=False)
45+
env_mod.apply_env_defaults()
46+
env_mod.apply_env_defaults()
47+
assert os.environ["OPENBLAS_NUM_THREADS"] == "1"
48+
49+
def test_bridge_import_applies_caps(self):
50+
"""memu_bridge must guarantee the caps before its numpy import.
51+
52+
numpy is typically already loaded by the time this test runs, so
53+
this can't verify load-order end-to-end — it pins the import
54+
dependency: importing the bridge module must (re)apply defaults.
55+
"""
56+
import nerve.memory.memu_bridge # noqa: F401
57+
58+
assert os.environ.get("OPENBLAS_NUM_THREADS") is not None

0 commit comments

Comments
 (0)