Merge pull request #652 from tobi/workoff/t_23fce575-dev-review
fix: avoid macOS Metal cleanup abort after JSON query
This commit is contained in:
commit
474ac7863d
@ -16,6 +16,7 @@
|
||||
flag. Previously it returned only the best matching chunk (~3.6KB max
|
||||
per result). Output payload for `--full` queries is now proportional
|
||||
to total document size.
|
||||
- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
|
||||
- Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
|
||||
to the requested collection instead of embedding global pending work.
|
||||
Scoped `--force` clears only collection-owned vectors, preserves shared
|
||||
|
||||
@ -207,6 +207,76 @@ const cursor = {
|
||||
show() { process.stderr.write('\x1b[?25h'); },
|
||||
};
|
||||
|
||||
type CliLifecycleWritable = {
|
||||
write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean;
|
||||
};
|
||||
|
||||
type FinishSuccessfulCliCommandOptions = {
|
||||
command: string;
|
||||
format?: OutputFormat;
|
||||
cleanup?: () => Promise<void>;
|
||||
exit?: (code: number) => void;
|
||||
immediateExit?: (code: number) => void;
|
||||
stdout?: CliLifecycleWritable;
|
||||
stderr?: CliLifecycleWritable;
|
||||
platform?: NodeJS.Platform;
|
||||
};
|
||||
|
||||
async function flushWritable(stream: CliLifecycleWritable): Promise<void> {
|
||||
await new Promise<void>((resolve) => {
|
||||
stream.write("", () => resolve());
|
||||
});
|
||||
}
|
||||
|
||||
function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean {
|
||||
return (
|
||||
(options.platform ?? process.platform) === "darwin" &&
|
||||
options.command === "query" &&
|
||||
options.format === "json" &&
|
||||
process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1"
|
||||
);
|
||||
}
|
||||
|
||||
function immediateProcessExit(code: number): void {
|
||||
const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void };
|
||||
if (typeof processWithReallyExit.reallyExit === "function") {
|
||||
processWithReallyExit.reallyExit(code);
|
||||
return;
|
||||
}
|
||||
process.exit(code);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finish a successful CLI command after output has been flushed. On macOS JSON
|
||||
* query runs, skip normal native teardown and use Node/Bun's immediate exit path:
|
||||
* ggml Metal can abort from C++ finalizers after valid JSON has already been
|
||||
* produced (#368). This wrapper is only reached after the command completed, so
|
||||
* real query failures still exit through the normal error path before this runs.
|
||||
*/
|
||||
export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise<void> {
|
||||
const stderr = options.stderr ?? process.stderr;
|
||||
const exit = options.exit ?? ((code: number) => process.exit(code));
|
||||
const immediateExit = options.immediateExit ?? immediateProcessExit;
|
||||
|
||||
await flushWritable(options.stdout ?? process.stdout);
|
||||
|
||||
if (shouldBypassNativeCleanup(options)) {
|
||||
await flushWritable(stderr);
|
||||
immediateExit(0);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await (options.cleanup ?? disposeDefaultLlamaCpp)();
|
||||
} catch (error) {
|
||||
stderr.write(
|
||||
`QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n`
|
||||
);
|
||||
}
|
||||
await flushWritable(stderr);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Ensure cursor is restored on exit
|
||||
process.on('SIGINT', () => { cursor.show(); process.exit(130); });
|
||||
process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
|
||||
@ -3421,8 +3491,10 @@ if (isMain) {
|
||||
}
|
||||
|
||||
if (cli.command !== "mcp") {
|
||||
await disposeDefaultLlamaCpp();
|
||||
process.exit(0);
|
||||
await finishSuccessfulCliCommand({
|
||||
command: cli.command,
|
||||
format: cli.opts.format,
|
||||
});
|
||||
}
|
||||
|
||||
} // end if (main module)
|
||||
|
||||
60
src/llm.ts
60
src/llm.ts
@ -540,6 +540,23 @@ export function resolveLlamaGpuMode(
|
||||
return "auto";
|
||||
}
|
||||
|
||||
async function disposeWithTimeout(resourceName: string, dispose: () => Promise<void>, timeoutMs = 1000): Promise<void> {
|
||||
const timeoutPromise = new Promise<"timeout">((resolve) => {
|
||||
setTimeout(() => resolve("timeout"), timeoutMs).unref();
|
||||
});
|
||||
|
||||
try {
|
||||
const result = await Promise.race([dispose(), timeoutPromise]);
|
||||
if (result === "timeout") {
|
||||
process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
|
||||
}
|
||||
} catch (error) {
|
||||
process.stderr.write(
|
||||
`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function resolveExpandContextSize(configValue?: number): number {
|
||||
if (configValue !== undefined) {
|
||||
if (!Number.isInteger(configValue) || configValue <= 0) {
|
||||
@ -1465,22 +1482,37 @@ export class LlamaCpp implements LLM {
|
||||
this.inactivityTimer = null;
|
||||
}
|
||||
|
||||
// Disposing llama cascades to models and contexts automatically
|
||||
// See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
|
||||
// Note: llama.dispose() can hang indefinitely, so we use a timeout
|
||||
if (this.llama) {
|
||||
const disposePromise = this.llama.dispose();
|
||||
const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
|
||||
await Promise.race([disposePromise, timeoutPromise]);
|
||||
// Explicitly dispose in dependency order: contexts first, then models, then llama.
|
||||
// Relying only on llama.dispose() leaves Metal resource sets alive until process
|
||||
// finalization on Apple Silicon, where ggml_metal_device_free can abort after
|
||||
// otherwise-successful CLI output (#368).
|
||||
for (const ctx of this.embedContexts) {
|
||||
await disposeWithTimeout("embedding context", () => ctx.dispose());
|
||||
}
|
||||
this.embedContexts = [];
|
||||
|
||||
for (const ctx of this.rerankContexts) {
|
||||
await disposeWithTimeout("rerank context", () => ctx.dispose());
|
||||
}
|
||||
this.rerankContexts = [];
|
||||
|
||||
if (this.embedModel) {
|
||||
await disposeWithTimeout("embedding model", () => this.embedModel!.dispose());
|
||||
this.embedModel = null;
|
||||
}
|
||||
if (this.generateModel) {
|
||||
await disposeWithTimeout("generation model", () => this.generateModel!.dispose());
|
||||
this.generateModel = null;
|
||||
}
|
||||
if (this.rerankModel) {
|
||||
await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose());
|
||||
this.rerankModel = null;
|
||||
}
|
||||
|
||||
// Clear references
|
||||
this.embedContexts = [];
|
||||
this.rerankContexts = [];
|
||||
this.embedModel = null;
|
||||
this.generateModel = null;
|
||||
this.rerankModel = null;
|
||||
this.llama = null;
|
||||
if (this.llama) {
|
||||
await disposeWithTimeout("llama runtime", () => this.llama!.dispose());
|
||||
this.llama = null;
|
||||
}
|
||||
|
||||
// Clear any in-flight load/create promises
|
||||
this.embedModelLoadPromise = null;
|
||||
|
||||
82
test/cli-exit-lifecycle.test.ts
Normal file
82
test/cli-exit-lifecycle.test.ts
Normal file
@ -0,0 +1,82 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts";
|
||||
import { LlamaCpp } from "../src/llm.ts";
|
||||
|
||||
describe("CLI successful-exit lifecycle", () => {
|
||||
test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => {
|
||||
const exitCodes: number[] = [];
|
||||
const stderr: string[] = [];
|
||||
const flushed: string[] = [];
|
||||
|
||||
await finishSuccessfulCliCommand({
|
||||
command: "query",
|
||||
format: "json",
|
||||
platform: "linux",
|
||||
cleanup: async () => {
|
||||
throw new Error("ggml_metal_device_free abort simulation");
|
||||
},
|
||||
exit: (code) => {
|
||||
exitCodes.push(code);
|
||||
},
|
||||
stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } },
|
||||
stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } },
|
||||
});
|
||||
|
||||
expect(exitCodes).toEqual([0]);
|
||||
expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed");
|
||||
expect(flushed).toEqual([""]);
|
||||
});
|
||||
|
||||
test("uses immediate exit for successful macOS JSON query after stdout flush", async () => {
|
||||
const calls: string[] = [];
|
||||
|
||||
await finishSuccessfulCliCommand({
|
||||
command: "query",
|
||||
format: "json",
|
||||
platform: "darwin",
|
||||
cleanup: async () => {
|
||||
calls.push("cleanup");
|
||||
},
|
||||
exit: (code) => {
|
||||
calls.push(`exit:${code}`);
|
||||
},
|
||||
immediateExit: (code) => {
|
||||
calls.push(`immediate-exit:${code}`);
|
||||
},
|
||||
stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } },
|
||||
stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } },
|
||||
});
|
||||
|
||||
expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]);
|
||||
});
|
||||
|
||||
test("disposes Llama resources in dependency order before CLI exit", async () => {
|
||||
const calls: string[] = [];
|
||||
const llm = new LlamaCpp({ inactivityTimeoutMs: 0 });
|
||||
const disposable = (name: string) => ({
|
||||
dispose: async () => {
|
||||
calls.push(name);
|
||||
},
|
||||
});
|
||||
|
||||
Object.assign(llm as unknown as Record<string, unknown>, {
|
||||
embedContexts: [disposable("embed-context")],
|
||||
rerankContexts: [disposable("rerank-context")],
|
||||
embedModel: disposable("embed-model"),
|
||||
generateModel: disposable("generate-model"),
|
||||
rerankModel: disposable("rerank-model"),
|
||||
llama: disposable("llama"),
|
||||
});
|
||||
|
||||
await llm.dispose();
|
||||
|
||||
expect(calls).toEqual([
|
||||
"embed-context",
|
||||
"rerank-context",
|
||||
"embed-model",
|
||||
"generate-model",
|
||||
"rerank-model",
|
||||
"llama",
|
||||
]);
|
||||
});
|
||||
});
|
||||
Loading…
Reference in New Issue
Block a user