Skip to content

Commit e8f8302

Browse files
committed
opfs: enable opfs temp_directory via pre-registered pool
Previously attempting to set temp_directory to an opfs:// path did not work. Specifically, while 'auto' file handling scanned SQL stmts for opfs:// paths to pre-open mentioned files via the async OPFS API and register them where the sync openFile call could find them, it only worked for specific _files_, not for a temp directory _in which_ open file would later attempt to create files. The sync vs async APIs of openFile vs OPFS presents a challenge here: when the DB decides to spill to a tempfile and attempts to create it, it does so via the openFile call, which is a sync call. This makes creating the OPFS file at that point, via the async OPFS API, a problem. To get around this, this change, when an OPFS temp directory is registered, sets up a 'temp pool' of pre-created files, with sync access handles, which can be handed out as needed in openFile calls. When closed, a file can be truncated and returned to the pool, or deleted if the pool is already full. When the pool runs low, new files can be opened and added to it async. This approach works around the sync vs async API mismatch, while still allowing openFile calls to create arbitrarily named -- from its point of view -- files at arbitrary times. Fixes #2061.
1 parent 269b8d1 commit e8f8302

File tree

9 files changed

+311
-4
lines changed

9 files changed

+311
-4
lines changed

packages/duckdb-wasm/src/bindings/bindings_base.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import { arrowToSQLField, arrowToSQLType } from '../json_typedef';
1313
import { WebFile } from './web_file';
1414
import { UDFFunction, UDFFunctionDeclaration } from './udf_function';
1515
import * as arrow from 'apache-arrow';
16+
import { createOPFSTempPool } from '../utils/opfs_util';
1617

1718
const TEXT_ENCODER = new TextEncoder();
1819

@@ -701,6 +702,21 @@ export abstract class DuckDBBindingsBase implements DuckDBBindings {
701702
throw new Error('Not an OPFS file name: ' + file);
702703
}
703704
}
705+
706+
public async registerOPFSTempDir(tempPath?: string, maxPoolSize?: number, minPoolSize?: number): Promise<void> {
707+
// Access BROWSER_RUNTIME through the runtime field
708+
const runtime = this._runtime as any;
709+
710+
if (runtime._opfsTmpPool) {
711+
await runtime._opfsTmpPool.destroy();
712+
runtime._opfsTmpPool = null;
713+
}
714+
715+
if (tempPath) {
716+
runtime._opfsTmpPool = await createOPFSTempPool(tempPath, { maxUnused: maxPoolSize, minUnused: minPoolSize });
717+
}
718+
}
719+
704720
public collectFileStatistics(file: string, enable: boolean): void {
705721
const [s, d, n] = callSRet(this.mod, 'duckdb_web_collect_file_stats', ['string', 'boolean'], [file, enable]);
706722
if (s !== StatusCode.SUCCESS) {

packages/duckdb-wasm/src/bindings/bindings_interface.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ export interface DuckDBBindings {
6363
copyFileToPath(name: string, path: string): void;
6464
copyFileToBuffer(name: string): Uint8Array;
6565
registerOPFSFileName(file: string): Promise<void>;
66+
registerOPFSTempDir(tempPath?: string, maxPoolSize?: number, minPoolSize?: number): Promise<void>;
6667
collectFileStatistics(file: string, enable: boolean): void;
6768
exportFileStatistics(file: string): FileStatistics;
6869
}

packages/duckdb-wasm/src/bindings/config.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,37 @@ export interface DuckDBOPFSConfig {
4040
* - "manual": Files must be manually registered and dropped.
4141
*/
4242
fileHandling?: "auto" | "manual";
43+
44+
/**
45+
* OPFS path for temporary files (e.g., "opfs://tmp").
46+
*
47+
* When set, a "pool" of pre-allocated temp files is maintained for use by
48+
* DuckDB when it opens a tempfile on-demand. Pre-allocation of tempfiles is
49+
* required when using OPFS due to the OPFS API providing only asynchronous
50+
* file creation, while DuckDB's temp file creation must be synchronous. By
51+
* maintaining a pool of pre-created temp files, DuckDB can synchronously
52+
* claim a temp file from the pool when needed.
53+
*
54+
* `SET temp_directory='opfs://...` can also be used to initialize or change
55+
* the temp directory at runtime when using "auto" fileHandling.
56+
*/
57+
tempPath?: string;
58+
59+
/**
60+
* Maximum number of pre-allocated file handles in the temp pool beyond
61+
* returned files will be deleted when closed.
62+
*/
63+
tempPoolMax?: number;
64+
65+
/**
66+
* Minimum number of unused pre-allocated handles to maintain in any temp
67+
* file pools. When a tempfile is opened from the pool causing the remaining
68+
* unused handles to drop below this number, new handles will be created
69+
* asynchronously in the background to refill the pool up to tempPoolMax.
70+
*
71+
* Must be less than tempPoolMax.
72+
*/
73+
tempPoolMin?: number;
4374
}
4475

4576
export enum DuckDBAccessMode {

packages/duckdb-wasm/src/bindings/runtime_browser.ts

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import {
1616
} from './runtime';
1717
import { DuckDBModule } from './duckdb_module';
1818
import * as udf from './udf_runtime';
19+
import { TmpPool } from '../utils/opfs_util';
1920

2021
const OPFS_PREFIX_LEN = 'opfs://'.length;
2122
const PATH_SEP_REGEX = /\/|\\/;
@@ -26,6 +27,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
2627
_globalFileInfo: DuckDBGlobalFileInfo | null;
2728
_preparedHandles: Record<string, FileSystemSyncAccessHandle>;
2829
_opfsRoot: FileSystemDirectoryHandle | null;
30+
_opfsTmpPool: TmpPool | null;
2931

3032
getFileInfo(mod: DuckDBModule, fileId: number): DuckDBFileInfo | null;
3133
getGlobalFileInfo(mod: DuckDBModule): DuckDBGlobalFileInfo | null;
@@ -37,6 +39,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
3739
_globalFileInfo: null,
3840
_preparedHandles: {} as any,
3941
_opfsRoot: null,
42+
_opfsTmpPool: null,
4043

4144
getFileInfo(mod: DuckDBModule, fileId: number): DuckDBFileInfo | null {
4245
try {
@@ -417,7 +420,21 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
417420
return result;
418421
}
419422
case DuckDBDataProtocol.BROWSER_FSACCESS: {
420-
const handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files?.get(file.fileName);
423+
let handle: FileSystemSyncAccessHandle | undefined = BROWSER_RUNTIME._files?.get(file.fileName);
424+
425+
// Check if this file belongs to a registered temp directory.
426+
// DuckDB creates temp files on-the-fly during query execution (for spilling),
427+
// calling openFile() synchronously. Since OPFS file creation is async, we can't
428+
// create new files here. Instead, we use pre-allocated file handles from the
429+
// temp pool that was set up when the temp directory was registered.
430+
if (!handle && BROWSER_RUNTIME._opfsTmpPool) {
431+
const pool = BROWSER_RUNTIME._opfsTmpPool;
432+
if (pool.matches(file.fileName)) {
433+
handle = pool.openFile(file.fileName);
434+
BROWSER_RUNTIME._files.set(file.fileName, handle);
435+
}
436+
}
437+
421438
if (!handle) {
422439
throw new Error(`No OPFS access handle registered with name: ${file.fileName}`);
423440
}
@@ -529,6 +546,7 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
529546
closeFile: (mod: DuckDBModule, fileId: number) => {
530547
const file = BROWSER_RUNTIME.getFileInfo(mod, fileId);
531548
BROWSER_RUNTIME._fileInfoCache.delete(fileId);
549+
532550
try {
533551
switch (file?.dataProtocol) {
534552
case DuckDBDataProtocol.BUFFER:
@@ -556,6 +574,10 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
556574
const fileName = readString(mod, fileNamePtr, fileNameLen);
557575
const handle: FileSystemSyncAccessHandle = BROWSER_RUNTIME._files?.get(fileName);
558576
if (handle) {
577+
if (BROWSER_RUNTIME._opfsTmpPool && BROWSER_RUNTIME._opfsTmpPool.matches(fileName)) {
578+
BROWSER_RUNTIME._opfsTmpPool.dropFile(fileName);
579+
return
580+
}
559581
BROWSER_RUNTIME._files.delete(fileName);
560582
if (handle instanceof FileSystemSyncAccessHandle) {
561583
try {
@@ -769,7 +791,13 @@ export const BROWSER_RUNTIME: DuckDBRuntime & {
769791
}
770792
return true;
771793
},
772-
removeFile: (_mod: DuckDBModule, _pathPtr: number, _pathLen: number) => {},
794+
removeFile: (mod: DuckDBModule, pathPtr: number, pathLen: number) => {
795+
const path = readString(mod, pathPtr, pathLen);
796+
797+
if (BROWSER_RUNTIME._opfsTmpPool && BROWSER_RUNTIME._opfsTmpPool.matches(path)) {
798+
BROWSER_RUNTIME._opfsTmpPool.dropFile(path);
799+
}
800+
},
773801
callScalarUDF: (
774802
mod: DuckDBModule,
775803
response: number,

packages/duckdb-wasm/src/parallel/async_bindings.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ export class AsyncDuckDB implements AsyncDuckDBBindings {
179179
case WorkerRequestType.CLOSE_PREPARED:
180180
case WorkerRequestType.COLLECT_FILE_STATISTICS:
181181
case WorkerRequestType.REGISTER_OPFS_FILE_NAME:
182+
case WorkerRequestType.REGISTER_OPFS_TEMP_DIR:
182183
case WorkerRequestType.COPY_FILE_TO_PATH:
183184
case WorkerRequestType.DISCONNECT:
184185
case WorkerRequestType.DROP_FILE:
@@ -386,6 +387,19 @@ export class AsyncDuckDB implements AsyncDuckDBBindings {
386387
this._config = config;
387388
const task = new WorkerTask<WorkerRequestType.OPEN, DuckDBConfig, null>(WorkerRequestType.OPEN, config);
388389
await this.postTask(task);
390+
391+
// If OPFS temp path is configured, eagerly initialize the pool
392+
if (config.opfs?.tempPath) {
393+
await this.registerOPFSTempDir(config.opfs.tempPath, config.opfs.tempPoolMax, config.opfs.tempPoolMin);
394+
395+
// Configure DuckDB to use this temp directory
396+
const conn = await this.connect();
397+
try {
398+
await conn.send(`SET temp_directory = '${config.opfs.tempPath}'`);
399+
} finally {
400+
await conn.close();
401+
}
402+
}
389403
}
390404

391405
/** Tokenize a script text */
@@ -614,6 +628,13 @@ export class AsyncDuckDB implements AsyncDuckDBBindings {
614628
);
615629
await this.postTask(task, []);
616630
}
631+
public async registerOPFSTempDir(name?: string, maxPoolSize?: number, minPoolSize?: number): Promise<void> {
632+
const task = new WorkerTask<WorkerRequestType.REGISTER_OPFS_TEMP_DIR, [string?, number?, number?], null>(
633+
WorkerRequestType.REGISTER_OPFS_TEMP_DIR,
634+
[name, maxPoolSize, minPoolSize],
635+
);
636+
await this.postTask(task, []);
637+
}
617638

618639
/** Enable file statistics */
619640
public async collectFileStatistics(name: string, enable: boolean): Promise<void> {
@@ -715,6 +736,18 @@ export class AsyncDuckDB implements AsyncDuckDBBindings {
715736
}
716737

717738
private async registerOPFSFileFromSQL(text: string) {
739+
const opfsTempDir = text.match(/(?:SET|PRAGMA)\s+temp_directory\s*=\s*['"]?(opfs:\/\/[^\s'";]+)['"]?/i);
740+
if (opfsTempDir ) {
741+
const newPath = opfsTempDir[1];
742+
743+
// Register the temp directory with the worker using config sizes if available
744+
await this.registerOPFSTempDir(newPath, this._config?.opfs?.tempPoolMax, this._config?.opfs?.tempPoolMin);
745+
// Remove the 'SET' or 'PRAGMA' temp_directory from the text that is
746+
// searched for opfs urls to avoid detecting and attempting to
747+
// register the temp *directory* as a file.
748+
text = text.replace(opfsTempDir[0], "");
749+
}
750+
718751
const files = searchOPFSFiles(text);
719752
const result: string[] = [];
720753
for (const file of files) {

packages/duckdb-wasm/src/parallel/worker_dispatcher.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,10 @@ export abstract class AsyncDuckDBDispatcher implements Logger {
365365
await this._bindings.registerOPFSFileName(request.data[0]);
366366
this.sendOK(request);
367367
break;
368-
368+
case WorkerRequestType.REGISTER_OPFS_TEMP_DIR:
369+
await this._bindings.registerOPFSTempDir(request.data[0], request.data[1], request.data[2]);
370+
this.sendOK(request);
371+
break;
369372
case WorkerRequestType.EXPORT_FILE_STATISTICS: {
370373
this.postMessage(
371374
{

packages/duckdb-wasm/src/parallel/worker_request.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ export enum WorkerRequestType {
1515
CLOSE_PREPARED = 'CLOSE_PREPARED',
1616
COLLECT_FILE_STATISTICS = 'COLLECT_FILE_STATISTICS',
1717
REGISTER_OPFS_FILE_NAME = 'REGISTER_OPFS_FILE_NAME',
18+
REGISTER_OPFS_TEMP_DIR = 'REGISTER_OPFS_TEMP_DIR',
1819
CONNECT = 'CONNECT',
1920
COPY_FILE_TO_BUFFER = 'COPY_FILE_TO_BUFFER',
2021
COPY_FILE_TO_PATH = 'COPY_FILE_TO_PATH',
@@ -111,6 +112,7 @@ export type WorkerRequestVariant =
111112
| WorkerRequest<WorkerRequestType.CANCEL_PENDING_QUERY, number>
112113
| WorkerRequest<WorkerRequestType.COLLECT_FILE_STATISTICS, [string, boolean]>
113114
| WorkerRequest<WorkerRequestType.REGISTER_OPFS_FILE_NAME, [string]>
115+
| WorkerRequest<WorkerRequestType.REGISTER_OPFS_TEMP_DIR, [string?, number?, number?]>
114116
| WorkerRequest<WorkerRequestType.CONNECT, null>
115117
| WorkerRequest<WorkerRequestType.COPY_FILE_TO_BUFFER, string>
116118
| WorkerRequest<WorkerRequestType.COPY_FILE_TO_PATH, [string, string]>
@@ -171,6 +173,7 @@ export type WorkerResponseVariant =
171173
export type WorkerTaskVariant =
172174
| WorkerTask<WorkerRequestType.COLLECT_FILE_STATISTICS, [string, boolean], null>
173175
| WorkerTask<WorkerRequestType.REGISTER_OPFS_FILE_NAME, [string], null>
176+
| WorkerTask<WorkerRequestType.REGISTER_OPFS_TEMP_DIR, [string?, number?, number?], null>
174177
| WorkerTask<WorkerRequestType.CLOSE_PREPARED, [number, number], null>
175178
| WorkerTask<WorkerRequestType.CONNECT, null, ConnectionID>
176179
| WorkerTask<WorkerRequestType.COPY_FILE_TO_BUFFER, string, Uint8Array>

packages/duckdb-wasm/src/utils/opfs_util.ts

Lines changed: 123 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,126 @@ export function isOPFSProtocol(path: string): boolean {
77

88
export function searchOPFSFiles(text: string) {
99
return [...text.matchAll(REGEX_OPFS_FILE)].map(match => match[1]);
10-
}
10+
}
11+
12+
type HandleEntry = { name: string; handle: FileSystemSyncAccessHandle };
13+
14+
export type TmpPoolConfig = { maxUnused?: number; minUnused?: number };
15+
16+
/**
17+
* TmpPool manages pre-allocated OPFS file handles for temporary files.
18+
*
19+
* DuckDB's file operations (openFile, dropFile, etc.) are synchronous C++ calls
20+
* that cannot be made async. However, OPFS file creation requires async APIs.
21+
* When DuckDB needs to create temporary files for spilling data to disk, it
22+
* calls openFile synchronously, expecting an immediate file handle.
23+
*
24+
* To work around this API mismatch, we pre-create a pool of OPFS files with
25+
* their sync access handles during the async temp directory registration. When
26+
* DuckDB needs a temp file, we can synchronously hand out one of these
27+
* pre-created handles from the pool. When the file is closed, we clear it and
28+
* return it to the pool for reuse.
29+
*/
30+
export class TmpPool {
31+
public readonly path: string;
32+
private dir: FileSystemDirectoryHandle;
33+
private maxUnused: number;
34+
private minUnused: number;
35+
private pool: HandleEntry[] = []; // unused files.
36+
private openMap = new Map<string, HandleEntry>(); // checked out files.
37+
private nextId = 1;
38+
private refillInFlight = false;
39+
40+
constructor(
41+
path: string,
42+
dir: FileSystemDirectoryHandle,
43+
maxUnused: number = 4,
44+
minUnused: number = 2
45+
) {
46+
if (minUnused >= maxUnused) throw new Error("minUnused must be < maxUnused");
47+
this.path = canonicalDirUrl(path);
48+
this.dir = dir;
49+
this.maxUnused = maxUnused;
50+
this.minUnused = minUnused;
51+
}
52+
53+
async init(): Promise<void> { await this.refillTo(this.maxUnused); }
54+
55+
matches(path: string): boolean {
56+
const canonical = canonicalDirUrl(path);
57+
return canonical === this.path || canonical.startsWith(this.path);
58+
}
59+
60+
openFile(path: string): FileSystemSyncAccessHandle {
61+
const existing = this.openMap.get(path); if (existing) return existing.handle;
62+
if (this.pool.length === 0) throw new Error("OPFS tmp pool exhausted");
63+
const entry = this.pool.pop()!; this.openMap.set(path, entry);
64+
if (this.pool.length < this.minUnused) this.maybeRefillAsync();
65+
return entry.handle;
66+
}
67+
68+
dropFile(path: string): void {
69+
const entry = this.openMap.get(path); if (!entry) return;
70+
entry.handle.flush();
71+
72+
if (this.pool.length >= this.maxUnused) {
73+
this.asyncDelete(entry).catch(() => {});
74+
} else {
75+
entry.handle.truncate(0);
76+
this.pool.push(entry);
77+
}
78+
this.openMap.delete(path);
79+
}
80+
81+
async destroy(): Promise<void> {
82+
await Promise.all(this.pool.splice(0).map(e => this.asyncDelete(e)));
83+
}
84+
85+
private async createEntry(): Promise<HandleEntry> {
86+
const name = `tmp${this.nextId++}`;
87+
const fh = await this.dir.getFileHandle(name, { create: true });
88+
const sah = await fh.createSyncAccessHandle();
89+
sah.truncate(0);
90+
return { name, handle: sah };
91+
}
92+
private async refillTo(target: number): Promise<void> {
93+
while (this.pool.length < target) {
94+
const e = await this.createEntry(); this.pool.push(e);
95+
}
96+
}
97+
private maybeRefillAsync() {
98+
if (this.refillInFlight) return;
99+
this.refillInFlight = true;
100+
this.refillTo(this.maxUnused).finally(() => { this.refillInFlight = false; });
101+
}
102+
private async asyncDelete(e: HandleEntry) {
103+
try { e.handle.flush(); } catch { /* ignore errors */ }
104+
try { e.handle.close(); } catch { /* ignore errors */ }
105+
try { await this.dir.removeEntry(e.name); } catch { /* ignore errors */ }
106+
}
107+
}
108+
109+
export function canonicalDirUrl(url: string) {
110+
return url.replace(/\/+$/, "");
111+
}
112+
113+
export async function resolveOpfsDirectory(opfsUrl: string): Promise<FileSystemDirectoryHandle> {
114+
const root = await (navigator as any).storage.getDirectory();
115+
const rel = opfsUrl.slice("opfs://".length).replace(/^\/+/, "");
116+
const parts = rel.split("/").filter(Boolean);
117+
let dir: FileSystemDirectoryHandle = root;
118+
for (const p of parts) {
119+
dir = await dir.getDirectoryHandle(p, { create: true });
120+
}
121+
return dir;
122+
}
123+
124+
export async function createOPFSTempPool(opfsDirUrl: string, cfg: TmpPoolConfig = {}) : Promise<TmpPool> {
125+
const key = canonicalDirUrl(opfsDirUrl);
126+
const dir = await resolveOpfsDirectory(key);
127+
const maxUnused = cfg.maxUnused ?? 10;
128+
const minUnused = cfg.minUnused ?? 2;
129+
const pool = new TmpPool(key, dir, maxUnused, minUnused);
130+
await pool.init();
131+
return pool;
132+
}

0 commit comments

Comments
 (0)