Merge pull request #652 from tobi/workoff/t_23fce575-dev-review

fix: avoid macOS Metal cleanup abort after JSON query
2026-05-16 13:41:02 -04:00 · 2026-05-16 13:41:02 -04:00 · 474ac7863d
commit 474ac7863d
parent 9bc316e545 b59ba6ab1e
4 changed files with 203 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -16,6 +16,7 @@
  flag. Previously it returned only the best matching chunk (~3.6KB max
  per result). Output payload for `--full` queries is now proportional
  to total document size.
+- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
  to the requested collection instead of embedding global pending work.
  Scoped `--force` clears only collection-owned vectors, preserves shared
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -207,6 +207,76 @@ const cursor = {
  show() { process.stderr.write('\x1b[?25h'); },
 };

+type CliLifecycleWritable = {
+  write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean;
+};
+
+type FinishSuccessfulCliCommandOptions = {
+  command: string;
+  format?: OutputFormat;
+  cleanup?: () => Promise<void>;
+  exit?: (code: number) => void;
+  immediateExit?: (code: number) => void;
+  stdout?: CliLifecycleWritable;
+  stderr?: CliLifecycleWritable;
+  platform?: NodeJS.Platform;
+};
+
+async function flushWritable(stream: CliLifecycleWritable): Promise<void> {
+  await new Promise<void>((resolve) => {
+    stream.write("", () => resolve());
+  });
+}
+
+function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean {
+  return (
+    (options.platform ?? process.platform) === "darwin" &&
+    options.command === "query" &&
+    options.format === "json" &&
+    process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1"
+  );
+}
+
+function immediateProcessExit(code: number): void {
+  const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void };
+  if (typeof processWithReallyExit.reallyExit === "function") {
+    processWithReallyExit.reallyExit(code);
+    return;
+  }
+  process.exit(code);
+}
+
+/**
+ * Finish a successful CLI command after output has been flushed. On macOS JSON
+ * query runs, skip normal native teardown and use Node/Bun's immediate exit path:
+ * ggml Metal can abort from C++ finalizers after valid JSON has already been
+ * produced (#368). This wrapper is only reached after the command completed, so
+ * real query failures still exit through the normal error path before this runs.
+ */
+export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise<void> {
+  const stderr = options.stderr ?? process.stderr;
+  const exit = options.exit ?? ((code: number) => process.exit(code));
+  const immediateExit = options.immediateExit ?? immediateProcessExit;
+
+  await flushWritable(options.stdout ?? process.stdout);
+
+  if (shouldBypassNativeCleanup(options)) {
+    await flushWritable(stderr);
+    immediateExit(0);
+    return;
+  }
+
+  try {
+    await (options.cleanup ?? disposeDefaultLlamaCpp)();
+  } catch (error) {
+    stderr.write(
+      `QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n`
+    );
+  }
+  await flushWritable(stderr);
+  exit(0);
+}
+
 // Ensure cursor is restored on exit
 process.on('SIGINT', () => { cursor.show(); process.exit(130); });
 process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
@ -3421,8 +3491,10 @@ if (isMain) {
  }

  if (cli.command !== "mcp") {
-    await disposeDefaultLlamaCpp();
-    process.exit(0);
+    await finishSuccessfulCliCommand({
+      command: cli.command,
+      format: cli.opts.format,
+    });
  }

 } // end if (main module)
--- a/src/llm.ts
+++ b/src/llm.ts
@ -540,6 +540,23 @@ export function resolveLlamaGpuMode(
  return "auto";
 }

+async function disposeWithTimeout(resourceName: string, dispose: () => Promise<void>, timeoutMs = 1000): Promise<void> {
+  const timeoutPromise = new Promise<"timeout">((resolve) => {
+    setTimeout(() => resolve("timeout"), timeoutMs).unref();
+  });
+
+  try {
+    const result = await Promise.race([dispose(), timeoutPromise]);
+    if (result === "timeout") {
+      process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
+    }
+  } catch (error) {
+    process.stderr.write(
+      `QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`
+    );
+  }
+}
+
 function resolveExpandContextSize(configValue?: number): number {
  if (configValue !== undefined) {
    if (!Number.isInteger(configValue) || configValue <= 0) {
@ -1465,22 +1482,37 @@ export class LlamaCpp implements LLM {
      this.inactivityTimer = null;
    }

-    // Disposing llama cascades to models and contexts automatically
-    // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
-    // Note: llama.dispose() can hang indefinitely, so we use a timeout
-    if (this.llama) {
-      const disposePromise = this.llama.dispose();
-      const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
-      await Promise.race([disposePromise, timeoutPromise]);
+    // Explicitly dispose in dependency order: contexts first, then models, then llama.
+    // Relying only on llama.dispose() leaves Metal resource sets alive until process
+    // finalization on Apple Silicon, where ggml_metal_device_free can abort after
+    // otherwise-successful CLI output (#368).
+    for (const ctx of this.embedContexts) {
+      await disposeWithTimeout("embedding context", () => ctx.dispose());
+    }
+    this.embedContexts = [];
+
+    for (const ctx of this.rerankContexts) {
+      await disposeWithTimeout("rerank context", () => ctx.dispose());
+    }
+    this.rerankContexts = [];
+
+    if (this.embedModel) {
+      await disposeWithTimeout("embedding model", () => this.embedModel!.dispose());
+      this.embedModel = null;
+    }
+    if (this.generateModel) {
+      await disposeWithTimeout("generation model", () => this.generateModel!.dispose());
+      this.generateModel = null;
+    }
+    if (this.rerankModel) {
+      await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose());
+      this.rerankModel = null;
    }

-    // Clear references
-    this.embedContexts = [];
-    this.rerankContexts = [];
-    this.embedModel = null;
-    this.generateModel = null;
-    this.rerankModel = null;
-    this.llama = null;
+    if (this.llama) {
+      await disposeWithTimeout("llama runtime", () => this.llama!.dispose());
+      this.llama = null;
+    }

    // Clear any in-flight load/create promises
    this.embedModelLoadPromise = null;
--- a/test/cli-exit-lifecycle.test.ts
+++ b/test/cli-exit-lifecycle.test.ts
@ -0,0 +1,82 @@
+import { describe, expect, test } from "vitest";
+import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts";
+import { LlamaCpp } from "../src/llm.ts";
+
+describe("CLI successful-exit lifecycle", () => {
+  test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => {
+    const exitCodes: number[] = [];
+    const stderr: string[] = [];
+    const flushed: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      platform: "linux",
+      cleanup: async () => {
+        throw new Error("ggml_metal_device_free abort simulation");
+      },
+      exit: (code) => {
+        exitCodes.push(code);
+      },
+      stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } },
+      stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } },
+    });
+
+    expect(exitCodes).toEqual([0]);
+    expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed");
+    expect(flushed).toEqual([""]);
+  });
+
+  test("uses immediate exit for successful macOS JSON query after stdout flush", async () => {
+    const calls: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      platform: "darwin",
+      cleanup: async () => {
+        calls.push("cleanup");
+      },
+      exit: (code) => {
+        calls.push(`exit:${code}`);
+      },
+      immediateExit: (code) => {
+        calls.push(`immediate-exit:${code}`);
+      },
+      stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } },
+      stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } },
+    });
+
+    expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]);
+  });
+
+  test("disposes Llama resources in dependency order before CLI exit", async () => {
+    const calls: string[] = [];
+    const llm = new LlamaCpp({ inactivityTimeoutMs: 0 });
+    const disposable = (name: string) => ({
+      dispose: async () => {
+        calls.push(name);
+      },
+    });
+
+    Object.assign(llm as unknown as Record<string, unknown>, {
+      embedContexts: [disposable("embed-context")],
+      rerankContexts: [disposable("rerank-context")],
+      embedModel: disposable("embed-model"),
+      generateModel: disposable("generate-model"),
+      rerankModel: disposable("rerank-model"),
+      llama: disposable("llama"),
+    });
+
+    await llm.dispose();
+
+    expect(calls).toEqual([
+      "embed-context",
+      "rerank-context",
+      "embed-model",
+      "generate-model",
+      "rerank-model",
+      "llama",
+    ]);
+  });
+});