Merge pull request #652 from tobi/workoff/t_23fce575-dev-review

fix: avoid macOS Metal cleanup abort after JSON query
This commit is contained in:
Tobias Lütke 2026-05-16 13:41:02 -04:00 committed by GitHub
commit 474ac7863d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 203 additions and 16 deletions

View File

@ -16,6 +16,7 @@
flag. Previously it returned only the best matching chunk (~3.6KB max
per result). Output payload for `--full` queries is now proportional
to total document size.
- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
- Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
to the requested collection instead of embedding global pending work.
Scoped `--force` clears only collection-owned vectors, preserves shared

View File

@ -207,6 +207,76 @@ const cursor = {
show() { process.stderr.write('\x1b[?25h'); },
};
type CliLifecycleWritable = {
write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean;
};
type FinishSuccessfulCliCommandOptions = {
command: string;
format?: OutputFormat;
cleanup?: () => Promise<void>;
exit?: (code: number) => void;
immediateExit?: (code: number) => void;
stdout?: CliLifecycleWritable;
stderr?: CliLifecycleWritable;
platform?: NodeJS.Platform;
};
async function flushWritable(stream: CliLifecycleWritable): Promise<void> {
await new Promise<void>((resolve) => {
stream.write("", () => resolve());
});
}
function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean {
return (
(options.platform ?? process.platform) === "darwin" &&
options.command === "query" &&
options.format === "json" &&
process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1"
);
}
function immediateProcessExit(code: number): void {
const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void };
if (typeof processWithReallyExit.reallyExit === "function") {
processWithReallyExit.reallyExit(code);
return;
}
process.exit(code);
}
/**
* Finish a successful CLI command after output has been flushed. On macOS JSON
* query runs, skip normal native teardown and use Node/Bun's immediate exit path:
* ggml Metal can abort from C++ finalizers after valid JSON has already been
* produced (#368). This wrapper is only reached after the command completed, so
* real query failures still exit through the normal error path before this runs.
*/
export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise<void> {
const stderr = options.stderr ?? process.stderr;
const exit = options.exit ?? ((code: number) => process.exit(code));
const immediateExit = options.immediateExit ?? immediateProcessExit;
await flushWritable(options.stdout ?? process.stdout);
if (shouldBypassNativeCleanup(options)) {
await flushWritable(stderr);
immediateExit(0);
return;
}
try {
await (options.cleanup ?? disposeDefaultLlamaCpp)();
} catch (error) {
stderr.write(
`QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n`
);
}
await flushWritable(stderr);
exit(0);
}
// Ensure cursor is restored on exit
process.on('SIGINT', () => { cursor.show(); process.exit(130); });
process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
@ -3421,8 +3491,10 @@ if (isMain) {
}
if (cli.command !== "mcp") {
await disposeDefaultLlamaCpp();
process.exit(0);
await finishSuccessfulCliCommand({
command: cli.command,
format: cli.opts.format,
});
}
} // end if (main module)

View File

@ -540,6 +540,23 @@ export function resolveLlamaGpuMode(
return "auto";
}
async function disposeWithTimeout(resourceName: string, dispose: () => Promise<void>, timeoutMs = 1000): Promise<void> {
const timeoutPromise = new Promise<"timeout">((resolve) => {
setTimeout(() => resolve("timeout"), timeoutMs).unref();
});
try {
const result = await Promise.race([dispose(), timeoutPromise]);
if (result === "timeout") {
process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
}
} catch (error) {
process.stderr.write(
`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`
);
}
}
function resolveExpandContextSize(configValue?: number): number {
if (configValue !== undefined) {
if (!Number.isInteger(configValue) || configValue <= 0) {
@ -1465,22 +1482,37 @@ export class LlamaCpp implements LLM {
this.inactivityTimer = null;
}
// Disposing llama cascades to models and contexts automatically
// See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
// Note: llama.dispose() can hang indefinitely, so we use a timeout
if (this.llama) {
const disposePromise = this.llama.dispose();
const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
await Promise.race([disposePromise, timeoutPromise]);
// Explicitly dispose in dependency order: contexts first, then models, then llama.
// Relying only on llama.dispose() leaves Metal resource sets alive until process
// finalization on Apple Silicon, where ggml_metal_device_free can abort after
// otherwise-successful CLI output (#368).
for (const ctx of this.embedContexts) {
await disposeWithTimeout("embedding context", () => ctx.dispose());
}
this.embedContexts = [];
for (const ctx of this.rerankContexts) {
await disposeWithTimeout("rerank context", () => ctx.dispose());
}
this.rerankContexts = [];
if (this.embedModel) {
await disposeWithTimeout("embedding model", () => this.embedModel!.dispose());
this.embedModel = null;
}
if (this.generateModel) {
await disposeWithTimeout("generation model", () => this.generateModel!.dispose());
this.generateModel = null;
}
if (this.rerankModel) {
await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose());
this.rerankModel = null;
}
// Clear references
this.embedContexts = [];
this.rerankContexts = [];
this.embedModel = null;
this.generateModel = null;
this.rerankModel = null;
this.llama = null;
if (this.llama) {
await disposeWithTimeout("llama runtime", () => this.llama!.dispose());
this.llama = null;
}
// Clear any in-flight load/create promises
this.embedModelLoadPromise = null;

View File

@ -0,0 +1,82 @@
import { describe, expect, test } from "vitest";
import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts";
import { LlamaCpp } from "../src/llm.ts";
describe("CLI successful-exit lifecycle", () => {
test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => {
const exitCodes: number[] = [];
const stderr: string[] = [];
const flushed: string[] = [];
await finishSuccessfulCliCommand({
command: "query",
format: "json",
platform: "linux",
cleanup: async () => {
throw new Error("ggml_metal_device_free abort simulation");
},
exit: (code) => {
exitCodes.push(code);
},
stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } },
stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } },
});
expect(exitCodes).toEqual([0]);
expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed");
expect(flushed).toEqual([""]);
});
test("uses immediate exit for successful macOS JSON query after stdout flush", async () => {
const calls: string[] = [];
await finishSuccessfulCliCommand({
command: "query",
format: "json",
platform: "darwin",
cleanup: async () => {
calls.push("cleanup");
},
exit: (code) => {
calls.push(`exit:${code}`);
},
immediateExit: (code) => {
calls.push(`immediate-exit:${code}`);
},
stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } },
stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } },
});
expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]);
});
test("disposes Llama resources in dependency order before CLI exit", async () => {
const calls: string[] = [];
const llm = new LlamaCpp({ inactivityTimeoutMs: 0 });
const disposable = (name: string) => ({
dispose: async () => {
calls.push(name);
},
});
Object.assign(llm as unknown as Record<string, unknown>, {
embedContexts: [disposable("embed-context")],
rerankContexts: [disposable("rerank-context")],
embedModel: disposable("embed-model"),
generateModel: disposable("generate-model"),
rerankModel: disposable("rerank-model"),
llama: disposable("llama"),
});
await llm.dispose();
expect(calls).toEqual([
"embed-context",
"rerank-context",
"embed-model",
"generate-model",
"rerank-model",
"llama",
]);
});
});