From 60c75cb3327df40f930d23c12a4c98c4a0f79a97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:20:21 +0000
Subject: [PATCH 1/2] fix: avoid macOS Metal cleanup abort after JSON query

---
 CHANGELOG.md                    |  1 +
 src/cli/qmd.ts                  | 76 ++++++++++++++++++++++++++++++-
 src/llm.ts                      | 60 ++++++++++++++++++------
 test/cli-exit-lifecycle.test.ts | 81 +++++++++++++++++++++++++++++++++
 4 files changed, 202 insertions(+), 16 deletions(-)
 create mode 100644 test/cli-exit-lifecycle.test.ts
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac69601..39b811a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
   flag. Previously it returned only the best matching chunk (~3.6KB max
   per result). Output payload for `--full` queries is now proportional
   to total document size.
+- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index 40bc0dd..df73f36 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -207,6 +207,76 @@ const cursor = {
   show() { process.stderr.write('\x1b[?25h'); },
 };
 
+type CliLifecycleWritable = {
+  write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean;
+};
+
+type FinishSuccessfulCliCommandOptions = {
+  command: string;
+  format?: OutputFormat;
+  cleanup?: () => Promise<void>;
+  exit?: (code: number) => void;
+  immediateExit?: (code: number) => void;
+  stdout?: CliLifecycleWritable;
+  stderr?: CliLifecycleWritable;
+  platform?: NodeJS.Platform;
+};
+
+async function flushWritable(stream: CliLifecycleWritable): Promise<void> {
+  await new Promise<void>((resolve) => {
+    stream.write("", () => resolve());
+  });
+}
+
+function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean {
+  return (
+    (options.platform ?? process.platform) === "darwin" &&
+    options.command === "query" &&
+    options.format === "json" &&
+    process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1"
+  );
+}
+
+function immediateProcessExit(code: number): void {
+  const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void };
+  if (typeof processWithReallyExit.reallyExit === "function") {
+    processWithReallyExit.reallyExit(code);
+    return;
+  }
+  process.exit(code);
+}
+
+/**
+ * Finish a successful CLI command after output has been flushed. On macOS JSON
+ * query runs, skip normal native teardown and use Node/Bun's immediate exit path:
+ * ggml Metal can abort from C++ finalizers after valid JSON has already been
+ * produced (#368). This wrapper is only reached after the command completed, so
+ * real query failures still exit through the normal error path before this runs.
+ */
+export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise<void> {
+  const stderr = options.stderr ?? process.stderr;
+  const exit = options.exit ?? ((code: number) => process.exit(code));
+  const immediateExit = options.immediateExit ?? immediateProcessExit;
+
+  await flushWritable(options.stdout ?? process.stdout);
+
+  if (shouldBypassNativeCleanup(options)) {
+    await flushWritable(stderr);
+    immediateExit(0);
+    return;
+  }
+
+  try {
+    await (options.cleanup ?? disposeDefaultLlamaCpp)();
+  } catch (error) {
+    stderr.write(
+      `QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n`
+    );
+  }
+  await flushWritable(stderr);
+  exit(0);
+}
+
 // Ensure cursor is restored on exit
 process.on('SIGINT', () => { cursor.show(); process.exit(130); });
 process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
@@ -3415,8 +3485,10 @@ if (isMain) {
   }
 
   if (cli.command !== "mcp") {
-    await disposeDefaultLlamaCpp();
-    process.exit(0);
+    await finishSuccessfulCliCommand({
+      command: cli.command,
+      format: cli.opts.format,
+    });
   }
 
 } // end if (main module)
diff --git a/src/llm.ts b/src/llm.ts
index d469d36..f7ec2fd 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -497,6 +497,23 @@ export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): Llama
   return "auto";
 }
 
+async function disposeWithTimeout(resourceName: string, dispose: () => Promise<void>, timeoutMs = 1000): Promise<void> {
+  const timeoutPromise = new Promise<"timeout">((resolve) => {
+    setTimeout(() => resolve("timeout"), timeoutMs).unref();
+  });
+
+  try {
+    const result = await Promise.race([dispose(), timeoutPromise]);
+    if (result === "timeout") {
+      process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
+    }
+  } catch (error) {
+    process.stderr.write(
+      `QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`
+    );
+  }
+}
+
 function resolveExpandContextSize(configValue?: number): number {
   if (configValue !== undefined) {
     if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -1413,22 +1430,37 @@ export class LlamaCpp implements LLM {
       this.inactivityTimer = null;
     }
 
-    // Disposing llama cascades to models and contexts automatically
-    // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
-    // Note: llama.dispose() can hang indefinitely, so we use a timeout
-    if (this.llama) {
-      const disposePromise = this.llama.dispose();
-      const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
-      await Promise.race([disposePromise, timeoutPromise]);
+    // Explicitly dispose in dependency order: contexts first, then models, then llama.
+    // Relying only on llama.dispose() leaves Metal resource sets alive until process
+    // finalization on Apple Silicon, where ggml_metal_device_free can abort after
+    // otherwise-successful CLI output (#368).
+    for (const ctx of this.embedContexts) {
+      await disposeWithTimeout("embedding context", () => ctx.dispose());
+    }
+    this.embedContexts = [];
+
+    for (const ctx of this.rerankContexts) {
+      await disposeWithTimeout("rerank context", () => ctx.dispose());
+    }
+    this.rerankContexts = [];
+
+    if (this.embedModel) {
+      await disposeWithTimeout("embedding model", () => this.embedModel!.dispose());
+      this.embedModel = null;
+    }
+    if (this.generateModel) {
+      await disposeWithTimeout("generation model", () => this.generateModel!.dispose());
+      this.generateModel = null;
+    }
+    if (this.rerankModel) {
+      await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose());
+      this.rerankModel = null;
     }
 
-    // Clear references
-    this.embedContexts = [];
-    this.rerankContexts = [];
-    this.embedModel = null;
-    this.generateModel = null;
-    this.rerankModel = null;
-    this.llama = null;
+    if (this.llama) {
+      await disposeWithTimeout("llama runtime", () => this.llama!.dispose());
+      this.llama = null;
+    }
 
     // Clear any in-flight load/create promises
     this.embedModelLoadPromise = null;
diff --git a/test/cli-exit-lifecycle.test.ts b/test/cli-exit-lifecycle.test.ts
new file mode 100644
index 0000000..b9328ed
--- /dev/null
+++ b/test/cli-exit-lifecycle.test.ts
@@ -0,0 +1,81 @@
+import { describe, expect, test } from "vitest";
+import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts";
+import { LlamaCpp } from "../src/llm.ts";
+
+describe("CLI successful-exit lifecycle", () => {
+  test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => {
+    const exitCodes: number[] = [];
+    const stderr: string[] = [];
+    const flushed: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      cleanup: async () => {
+        throw new Error("ggml_metal_device_free abort simulation");
+      },
+      exit: (code) => {
+        exitCodes.push(code);
+      },
+      stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } },
+      stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } },
+    });
+
+    expect(exitCodes).toEqual([0]);
+    expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed");
+    expect(flushed).toEqual([""]);
+  });
+
+  test("uses immediate exit for successful macOS JSON query after stdout flush", async () => {
+    const calls: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      platform: "darwin",
+      cleanup: async () => {
+        calls.push("cleanup");
+      },
+      exit: (code) => {
+        calls.push(`exit:${code}`);
+      },
+      immediateExit: (code) => {
+        calls.push(`immediate-exit:${code}`);
+      },
+      stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } },
+      stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } },
+    });
+
+    expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]);
+  });
+
+  test("disposes Llama resources in dependency order before CLI exit", async () => {
+    const calls: string[] = [];
+    const llm = new LlamaCpp({ inactivityTimeoutMs: 0 });
+    const disposable = (name: string) => ({
+      dispose: async () => {
+        calls.push(name);
+      },
+    });
+
+    Object.assign(llm as unknown as Record<string, unknown>, {
+      embedContexts: [disposable("embed-context")],
+      rerankContexts: [disposable("rerank-context")],
+      embedModel: disposable("embed-model"),
+      generateModel: disposable("generate-model"),
+      rerankModel: disposable("rerank-model"),
+      llama: disposable("llama"),
+    });
+
+    await llm.dispose();
+
+    expect(calls).toEqual([
+      "embed-context",
+      "rerank-context",
+      "embed-model",
+      "generate-model",
+      "rerank-model",
+      "llama",
+    ]);
+  });
+});

From b59ba6ab1ed35631b17ac914bfceeea588d67ef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:32:45 +0000
Subject: [PATCH 2/2] test: keep cleanup lifecycle regression portable

---
 test/cli-exit-lifecycle.test.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/cli-exit-lifecycle.test.ts b/test/cli-exit-lifecycle.test.ts
index b9328ed..8558596 100644
--- a/test/cli-exit-lifecycle.test.ts
+++ b/test/cli-exit-lifecycle.test.ts
@@ -11,6 +11,7 @@ describe("CLI successful-exit lifecycle", () => {
     await finishSuccessfulCliCommand({
       command: "query",
       format: "json",
+      platform: "linux",
       cleanup: async () => {
         throw new Error("ggml_metal_device_free abort simulation");
       },