From dd5d82d52368fd0e7501d3f939233bed7dd617e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 16 May 2026 17:18:06 +0000 Subject: [PATCH] fix: keep llama GPU fallback noise off JSON stdout --- CHANGELOG.md | 1 + README.md | 1 + src/cli/qmd.ts | 6 ++++ src/llm.ts | 66 +++++++++++++++++++++++++++++++---- test/cli.test.ts | 1 + test/llm.test.ts | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 158 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2757c8..d7378bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Fixes +- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands. - Embedding: `qmd embed -c ` now scopes pending-doc selection to the requested collection instead of embedding global pending work. Scoped `--force` clears only collection-owned vectors, preserves shared diff --git a/README.md b/README.md index 02e4b1e..7eadb93 100644 --- a/README.md +++ b/README.md @@ -798,6 +798,7 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores) |----------|---------|-------------| | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location | | `QMD_LLAMA_GPU` | `auto` | Force llama.cpp GPU backend (`metal`, `vulkan`, `cuda`) or disable GPU with `false` | +| `QMD_FORCE_CPU` | unset | Set to `1`/`true` to force CPU mode before any CUDA/Vulkan/Metal probing. Equivalent CLI flag: `--no-gpu`. | | `QMD_EMBED_PARALLELISM` | automatic | Override embedding/reranking context parallelism (1-8). Windows CUDA defaults to `1` because parallel CUDA contexts can crash with `ggml-cuda.cu:98`; use Vulkan or raise this only if your driver is stable. | ## How It Works diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 01dc540..7df8401 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -2562,6 +2562,7 @@ function parseCLI() { // Query options "candidate-limit": { type: "string", short: "C" }, "no-rerank": { type: "boolean", default: false }, + "no-gpu": { type: "boolean", default: false }, intent: { type: "string" }, // Chunking options "chunk-strategy": { type: "string" }, // "regex" (default) or "auto" (AST for code files) @@ -2574,6 +2575,10 @@ function parseCLI() { strict: false, // Allow unknown options to pass through }); + if (values["no-gpu"]) { + process.env.QMD_FORCE_CPU = "1"; + } + // Select index name (default: "index") const indexName = values.index as string | undefined; if (indexName) { @@ -2826,6 +2831,7 @@ function showHelp(): void { console.log(" --full - Output full document instead of snippet"); console.log(" -C, --candidate-limit - Max candidates to rerank (default 40, lower = faster)"); console.log(" --no-rerank - Skip LLM reranking (use RRF scores only, much faster on CPU)"); + console.log(" --no-gpu - Force CPU mode for llama.cpp operations (same as QMD_FORCE_CPU=1)"); console.log(" --line-numbers - Include line numbers in output"); console.log(" --explain - Include retrieval score traces (query --json/CLI)"); console.log(" --files | --json | --csv | --md | --xml - Output format"); diff --git a/src/llm.ts b/src/llm.ts index d469d36..b0b30d4 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -22,10 +22,45 @@ type NodeLlamaCppModule = { let nodeLlamaCppImport: Promise | null = null; async function loadNodeLlamaCpp(): Promise { - nodeLlamaCppImport ??= import("node-llama-cpp") as Promise; + nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr( + () => import("node-llama-cpp") as Promise + ); return nodeLlamaCppImport; } +export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void { + nodeLlamaCppImport = module ? Promise.resolve(module) : null; + failedGpuInitModes.clear(); +} + +type StdoutWrite = typeof process.stdout.write; +let nativeStdoutRedirectDepth = 0; +let originalStdoutWrite: StdoutWrite | null = null; + +/** + * Some node-llama-cpp native build/probe paths write library noise to stdout. + * JSON APIs must reserve stdout for machine-readable payloads, so route that + * noise to stderr while native llama initialization is in progress. + */ +export async function withNativeStdoutRedirectedToStderr(fn: () => Promise): Promise { + if (nativeStdoutRedirectDepth === 0) { + originalStdoutWrite = process.stdout.write.bind(process.stdout) as StdoutWrite; + process.stdout.write = ((chunk: any, encoding?: any, cb?: any) => { + return process.stderr.write(chunk, encoding, cb as any); + }) as StdoutWrite; + } + nativeStdoutRedirectDepth++; + try { + return await fn(); + } finally { + nativeStdoutRedirectDepth--; + if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) { + process.stdout.write = originalStdoutWrite; + originalStdoutWrite = null; + } + } +} + import { homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs"; @@ -487,7 +522,15 @@ export function resolveSafeParallelism(options: ParallelismOptions): number { return Math.max(1, options.computed); } -export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode { +export function resolveLlamaGpuMode( + envValue = process.env.QMD_LLAMA_GPU, + forceCpuValue = process.env.QMD_FORCE_CPU +): LlamaGpuMode { + const forceCpu = forceCpuValue?.trim().toLowerCase() ?? ""; + if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) { + return false; + } + const normalized = envValue?.trim().toLowerCase() ?? ""; if (!normalized) return "auto"; if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false; @@ -518,6 +561,8 @@ function resolveExpandContextSize(configValue?: number): number { return parsed; } +const failedGpuInitModes = new Set(); + export class LlamaCpp implements LLM { private readonly _ciMode = !!process.env.CI; private llama: Llama | null = null; @@ -668,22 +713,29 @@ export class LlamaCpp implements LLM { const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp(); const loadLlama = async (gpu: LlamaGpuMode) => - await getLlama({ + await withNativeStdoutRedirectedToStderr(() => getLlama({ build: allowBuild ? "autoAttempt" : "never", logLevel: LlamaLogLevel.error, gpu, skipDownload: !allowBuild, - }); + })); let llama: Llama; - if (gpuMode === false) { + if (gpuMode === false || failedGpuInitModes.has(gpuMode)) { + if (gpuMode !== false && failedGpuInitModes.has(gpuMode)) { + process.stderr.write( + `QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n` + ); + } llama = await loadLlama(false); } else { try { llama = await loadLlama(gpuMode); } catch (err) { - // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init. - // Fall back to CPU so qmd still works. + // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init. + // Fall back to CPU so qmd still works, and cache the failure to avoid repeated + // expensive native build/probe attempts in this process. + failedGpuInitModes.add(gpuMode); process.stderr.write( `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n` ); diff --git a/test/cli.test.ts b/test/cli.test.ts index e4ceb35..aacfff5 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -233,6 +233,7 @@ describe("CLI Help", () => { expect(stdout).toContain("Usage:"); expect(stdout).toContain("qmd collection add"); expect(stdout).toContain("qmd search"); + expect(stdout).toContain("--no-gpu"); expect(stdout).toContain("qmd skill show/install"); }); diff --git a/test/llm.test.ts b/test/llm.test.ts index ff22c0c..2fc03cd 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -13,6 +13,8 @@ import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, resolveLlamaGpuMode, + setNodeLlamaCppModuleForTest, + withNativeStdoutRedirectedToStderr, resolveParallelismOverride, resolveSafeParallelism, withLLMSession, @@ -78,6 +80,29 @@ describe("QMD_LLAMA_GPU resolution", () => { expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda"); }); + test("QMD_FORCE_CPU disables GPU before QMD_LLAMA_GPU auto-detection", () => { + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_FORCE_CPU = "1"; + try { + expect(resolveLlamaGpuMode(undefined)).toBe(false); + expect(resolveLlamaGpuMode("cuda")).toBe(false); + } finally { + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); + + test("QMD_FORCE_CPU ignores false-ish values", () => { + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_FORCE_CPU = "0"; + try { + expect(resolveLlamaGpuMode(undefined)).toBe("auto"); + } finally { + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); + test("warns and falls back to auto for unsupported values", () => { const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); try { @@ -90,6 +115,71 @@ describe("QMD_LLAMA_GPU resolution", () => { }); }); +describe("native llama stdout containment", () => { + test("redirects native stdout noise to stderr while JSON callers are initializing llama", async () => { + const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true); + const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); + try { + await withNativeStdoutRedirectedToStderr(async () => { + process.stdout.write("cmake build spam\n"); + return "ok"; + }); + + expect(stdoutSpy).not.toHaveBeenCalled(); + expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined); + } finally { + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + } + }); + + test("keeps native GPU failure noise off stdout and caches failed GPU init", async () => { + const prevGpu = process.env.QMD_LLAMA_GPU; + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_LLAMA_GPU = "cuda"; + delete process.env.QMD_FORCE_CPU; + + const calls: unknown[] = []; + const fakeLlama = { gpu: false, cpuMathCores: 4 }; + setNodeLlamaCppModuleForTest({ + LlamaLogLevel: { error: "error" }, + resolveModelFile: vi.fn(), + LlamaChatSession: vi.fn() as any, + getLlama: vi.fn(async (options: Record) => { + calls.push(options.gpu); + if (options.gpu === "cuda") { + process.stdout.write("cmake build spam\n"); + throw new Error("CUDA unavailable"); + } + return fakeLlama as any; + }), + }); + + const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true); + const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); + try { + const first = new LlamaCpp(); + const second = new LlamaCpp(); + + await (first as any).ensureLlama(); + await (second as any).ensureLlama(); + + expect(stdoutSpy).not.toHaveBeenCalled(); + expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined); + expect(calls).toEqual(["cuda", false, false]); + expect(String(stderrSpy.mock.calls.map(call => call[0]).join(""))).toContain("skipping previously failed GPU init"); + } finally { + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + setNodeLlamaCppModuleForTest(null); + if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU; + else process.env.QMD_LLAMA_GPU = prevGpu; + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); +}); + describe("LLM context parallelism safety", () => { test("defaults Windows CUDA to one context to avoid ggml-cuda.cu:98 crashes", () => { expect(resolveSafeParallelism({