From dd5d82d52368fd0e7501d3f939233bed7dd617e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:18:06 +0000
Subject: [PATCH] fix: keep llama GPU fallback noise off JSON stdout

---
 CHANGELOG.md     |  1 +
 README.md        |  1 +
 src/cli/qmd.ts   |  6 ++++
 src/llm.ts       | 66 +++++++++++++++++++++++++++++++----
 test/cli.test.ts |  1 +
 test/llm.test.ts | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 158 insertions(+), 7 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2757c8..d7378bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Fixes
 
+- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared
diff --git a/README.md b/README.md
index 02e4b1e..7eadb93 100644
--- a/README.md
+++ b/README.md
@@ -798,6 +798,7 @@ llm_cache       -- Cached LLM responses (query expansion, rerank scores)
 |----------|---------|-------------|
 | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
 | `QMD_LLAMA_GPU` | `auto` | Force llama.cpp GPU backend (`metal`, `vulkan`, `cuda`) or disable GPU with `false` |
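+| `QMD_FORCE_CPU` | unset | Set to any non-empty value other than `0`/`false`/`off`/`none`/`disable`/`disabled` (e.g. `1` or `true`) to force CPU mode before any CUDA/Vulkan/Metal probing; this overrides `QMD_LLAMA_GPU`. Equivalent CLI flag: `--no-gpu`. |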
 | `QMD_EMBED_PARALLELISM` | automatic | Override embedding/reranking context parallelism (1-8). Windows CUDA defaults to `1` because parallel CUDA contexts can crash with `ggml-cuda.cu:98`; use Vulkan or raise this only if your driver is stable. |
 
 ## How It Works
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index 01dc540..7df8401 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -2562,6 +2562,7 @@ function parseCLI() {
       // Query options
       "candidate-limit": { type: "string", short: "C" },
       "no-rerank": { type: "boolean", default: false },
+      "no-gpu": { type: "boolean", default: false },
       intent: { type: "string" },
       // Chunking options
       "chunk-strategy": { type: "string" },  // "regex" (default) or "auto" (AST for code files)
@@ -2574,6 +2575,10 @@ function parseCLI() {
     strict: false, // Allow unknown options to pass through
   });
 
+  if (values["no-gpu"]) {
+    process.env.QMD_FORCE_CPU = "1";
+  }
+
   // Select index name (default: "index")
   const indexName = values.index as string | undefined;
   if (indexName) {
@@ -2826,6 +2831,7 @@ function showHelp(): void {
   console.log("  --full                     - Output full document instead of snippet");
   console.log("  -C, --candidate-limit <n>  - Max candidates to rerank (default 40, lower = faster)");
   console.log("  --no-rerank                - Skip LLM reranking (use RRF scores only, much faster on CPU)");
+  console.log("  --no-gpu                   - Force CPU mode for llama.cpp operations (same as QMD_FORCE_CPU=1)");
   console.log("  --line-numbers             - Include line numbers in output");
   console.log("  --explain                  - Include retrieval score traces (query --json/CLI)");
   console.log("  --files | --json | --csv | --md | --xml  - Output format");
diff --git a/src/llm.ts b/src/llm.ts
index d469d36..b0b30d4 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -22,10 +22,45 @@ type NodeLlamaCppModule = {
 
 let nodeLlamaCppImport: Promise<NodeLlamaCppModule> | null = null;
 async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
-  nodeLlamaCppImport ??= import("node-llama-cpp") as Promise<NodeLlamaCppModule>;
+  nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr(
+    () => import("node-llama-cpp") as Promise<NodeLlamaCppModule>
+  ); // `??=` only assigns while the cache is null, so later calls reuse the same import promise
   return nodeLlamaCppImport;
 }
 
+export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void {
+  failedGpuInitModes.clear(); // test hook: also forget GPU init failures cached by earlier runs
+  nodeLlamaCppImport = !module ? null : Promise.resolve(module); // null makes the next load import for real
+}
+
+type StdoutWrite = typeof process.stdout.write; // signature shared by the saved writer and the redirecting stand-in
+let nativeStdoutRedirectDepth = 0; // number of withNativeStdoutRedirectedToStderr calls currently in flight
+let originalStdoutWrite: StdoutWrite | null = null; // writer to reinstall once the depth returns to 0; null while no redirect is active
+
+/**
+ * Runs `fn` with JS-level `process.stdout.write` routed to stderr so llama init
+ * noise cannot corrupt JSON on stdout. Overlapping calls share one depth-counted
+ * redirect; the last one to settle reinstalls the exact original `write` function.
+ */
+export async function withNativeStdoutRedirectedToStderr<T>(fn: () => Promise<T>): Promise<T> {
+  if (nativeStdoutRedirectDepth === 0) {
+    // Keep the real function, not a .bind() copy, so restoring reinstates the same reference.
+    originalStdoutWrite = process.stdout.write;
+    process.stdout.write = ((chunk: any, encoding?: any, cb?: any) =>
+      process.stderr.write(chunk, encoding, cb)) as StdoutWrite;
+  }
+  nativeStdoutRedirectDepth++;
+  try {
+    return await fn();
+  } finally {
+    nativeStdoutRedirectDepth--;
+    if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
+      process.stdout.write = originalStdoutWrite;
+      originalStdoutWrite = null;
+    }
+  }
+}
+
 import { homedir } from "os";
 import { join } from "path";
 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
@@ -487,7 +522,15 @@ export function resolveSafeParallelism(options: ParallelismOptions): number {
   return Math.max(1, options.computed);
 }
 
-export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode {
+export function resolveLlamaGpuMode(
+  envValue = process.env.QMD_LLAMA_GPU,
+  forceCpuValue = process.env.QMD_FORCE_CPU
+): LlamaGpuMode {
+  const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
+  if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) {
+    return false;
+  }
+
   const normalized = envValue?.trim().toLowerCase() ?? "";
   if (!normalized) return "auto";
   if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false;
@@ -518,6 +561,8 @@ function resolveExpandContextSize(configValue?: number): number {
   return parsed;
 }
 
+const failedGpuInitModes = new Set<LlamaGpuMode>(); // GPU modes whose init already threw in this process; later loads of those modes go straight to CPU
+
 export class LlamaCpp implements LLM {
   private readonly _ciMode = !!process.env.CI;
   private llama: Llama | null = null;
@@ -668,22 +713,29 @@ export class LlamaCpp implements LLM {
 
       const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp();
       const loadLlama = async (gpu: LlamaGpuMode) =>
-        await getLlama({
+        await withNativeStdoutRedirectedToStderr(() => getLlama({
           build: allowBuild ? "autoAttempt" : "never",
           logLevel: LlamaLogLevel.error,
           gpu,
           skipDownload: !allowBuild,
-        });
+        }));
 
       let llama: Llama;
-      if (gpuMode === false) {
+      if (gpuMode === false || failedGpuInitModes.has(gpuMode)) {
+        if (gpuMode !== false && failedGpuInitModes.has(gpuMode)) {
+          process.stderr.write(
+            `QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`
+          );
+        }
         llama = await loadLlama(false);
       } else {
         try {
           llama = await loadLlama(gpuMode);
         } catch (err) {
-          // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
-          // Fall back to CPU so qmd still works.
+          // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
+          // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
+          // expensive native build/probe attempts in this process.
+          failedGpuInitModes.add(gpuMode);
           process.stderr.write(
             `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
           );
diff --git a/test/cli.test.ts b/test/cli.test.ts
index e4ceb35..aacfff5 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -233,6 +233,7 @@ describe("CLI Help", () => {
     expect(stdout).toContain("Usage:");
     expect(stdout).toContain("qmd collection add");
     expect(stdout).toContain("qmd search");
+    expect(stdout).toContain("--no-gpu");
     expect(stdout).toContain("qmd skill show/install");
   });
 
diff --git a/test/llm.test.ts b/test/llm.test.ts
index ff22c0c..2fc03cd 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -13,6 +13,8 @@ import {
   getDefaultLlamaCpp,
   disposeDefaultLlamaCpp,
   resolveLlamaGpuMode,
+  setNodeLlamaCppModuleForTest,
+  withNativeStdoutRedirectedToStderr,
   resolveParallelismOverride,
   resolveSafeParallelism,
   withLLMSession,
@@ -78,6 +80,29 @@ describe("QMD_LLAMA_GPU resolution", () => {
     expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda");
   });
 
+  test("QMD_FORCE_CPU disables GPU before QMD_LLAMA_GPU auto-detection", () => {
+    // Force-CPU must win whether QMD_LLAMA_GPU is left to its default or names a backend.
+    const savedForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_FORCE_CPU = "1";
+    try {
+      for (const gpuSetting of [undefined, "cuda"]) expect(resolveLlamaGpuMode(gpuSetting)).toBe(false);
+    } finally {
+      if (savedForceCpu !== undefined) process.env.QMD_FORCE_CPU = savedForceCpu;
+      else delete process.env.QMD_FORCE_CPU;
+    }
+  });
+
+  test("QMD_FORCE_CPU ignores false-ish values", () => {
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_FORCE_CPU = "0";
+    try {
+      expect(resolveLlamaGpuMode("")).toBe("auto"); // "" pins QMD_LLAMA_GPU; undefined would read the ambient env
+    } finally {
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+
   test("warns and falls back to auto for unsupported values", () => {
     const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
     try {
@@ -90,6 +115,71 @@ describe("QMD_LLAMA_GPU resolution", () => {
   });
 });
 
+describe("native llama stdout containment", () => {
+  test("redirects native stdout noise to stderr while JSON callers are initializing llama", async () => {
+    // Both streams are stubbed so the test can see which one receives the write.
+    const outWrite = vi.spyOn(process.stdout, "write").mockReturnValue(true);
+    const errWrite = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    const emitNoise = async () => {
+      process.stdout.write("cmake build spam\n");
+      return "ok";
+    };
+    try {
+      await withNativeStdoutRedirectedToStderr(emitNoise);
+      expect(outWrite).not.toHaveBeenCalled();
+      expect(errWrite).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
+    } finally {
+      for (const spy of [outWrite, errWrite]) spy.mockRestore();
+    }
+  });
+
+  test("keeps native GPU failure noise off stdout and caches failed GPU init", async () => { // two instances share one fake module
+    const prevGpu = process.env.QMD_LLAMA_GPU;
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_LLAMA_GPU = "cuda";
+    delete process.env.QMD_FORCE_CPU; // presumably a forced-CPU env would skip the GPU attempt this test needs
+
+    const calls: unknown[] = []; // `gpu` option of every getLlama call, in call order
+    const fakeLlama = { gpu: false, cpuMathCores: 4 };
+    setNodeLlamaCppModuleForTest({
+      LlamaLogLevel: { error: "error" },
+      resolveModelFile: vi.fn(),
+      LlamaChatSession: vi.fn() as any,
+      getLlama: vi.fn(async (options: Record<string, unknown>) => {
+        calls.push(options.gpu);
+        if (options.gpu === "cuda") { // simulate a CUDA probe that prints to stdout and then fails
+          process.stdout.write("cmake build spam\n");
+          throw new Error("CUDA unavailable");
+        }
+        return fakeLlama as any;
+      }),
+    });
+
+    const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      const first = new LlamaCpp();
+      const second = new LlamaCpp();
+
+      await (first as any).ensureLlama();
+      await (second as any).ensureLlama();
+
+      expect(stdoutSpy).not.toHaveBeenCalled();
+      expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
+      expect(calls).toEqual(["cuda", false, false]); // one failed cuda attempt, its CPU fallback, then CPU directly for the second instance
+      expect(String(stderrSpy.mock.calls.map(call => call[0]).join(""))).toContain("skipping previously failed GPU init");
+    } finally {
+      stdoutSpy.mockRestore();
+      stderrSpy.mockRestore();
+      setNodeLlamaCppModuleForTest(null); // drop the fake module and clear the cached GPU failure
+      if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU;
+      else process.env.QMD_LLAMA_GPU = prevGpu;
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+});
+
 describe("LLM context parallelism safety", () => {
   test("defaults Windows CUDA to one context to avoid ggml-cuda.cu:98 crashes", () => {
     expect(resolveSafeParallelism({