Merge pull request #651 from tobi/workoff/t_0d576ae5-dev-review
fix: keep llama GPU fallback noise off JSON stdout
This commit is contained in:
commit
9bc316e545
@ -4,6 +4,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
|
||||
- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
|
||||
(CLI JSON output and snippet headers) now return absolute source-file
|
||||
line numbers instead of chunk-local ones, so the `line` field can be
|
||||
|
||||
@ -798,6 +798,7 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores)
|
||||
|----------|---------|-------------|
|
||||
| `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
|
||||
| `QMD_LLAMA_GPU` | `auto` | Force llama.cpp GPU backend (`metal`, `vulkan`, `cuda`) or disable GPU with `false` |
|
||||
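| `QMD_FORCE_CPU` | unset | Set to `1`/`true` (any value other than `0`, `false`, `off`, `none`, `disable`, or `disabled`) to force CPU mode before any CUDA/Vulkan/Metal probing. Takes precedence over `QMD_LLAMA_GPU`. Equivalent CLI flag: `--no-gpu`. |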
|
||||
| `QMD_EMBED_PARALLELISM` | automatic | Override embedding/reranking context parallelism (1-8). Windows CUDA defaults to `1` because parallel CUDA contexts can crash with `ggml-cuda.cu:98`; use Vulkan or raise this only if your driver is stable. |
|
||||
|
||||
## How It Works
|
||||
|
||||
@ -2565,6 +2565,7 @@ function parseCLI() {
|
||||
// Query options
|
||||
"candidate-limit": { type: "string", short: "C" },
|
||||
"no-rerank": { type: "boolean", default: false },
|
||||
"no-gpu": { type: "boolean", default: false },
|
||||
intent: { type: "string" },
|
||||
// Chunking options
|
||||
"chunk-strategy": { type: "string" }, // "regex" (default) or "auto" (AST for code files)
|
||||
@ -2577,6 +2578,10 @@ function parseCLI() {
|
||||
strict: false, // Allow unknown options to pass through
|
||||
});
|
||||
|
||||
if (values["no-gpu"]) {
|
||||
process.env.QMD_FORCE_CPU = "1";
|
||||
}
|
||||
|
||||
// Select index name (default: "index")
|
||||
const indexName = values.index as string | undefined;
|
||||
if (indexName) {
|
||||
@ -2829,6 +2834,7 @@ function showHelp(): void {
|
||||
console.log(" --full - Output full document instead of snippet");
|
||||
console.log(" -C, --candidate-limit <n> - Max candidates to rerank (default 40, lower = faster)");
|
||||
console.log(" --no-rerank - Skip LLM reranking (use RRF scores only, much faster on CPU)");
|
||||
console.log(" --no-gpu - Force CPU mode for llama.cpp operations (same as QMD_FORCE_CPU=1)");
|
||||
console.log(" --line-numbers - Include line numbers in output");
|
||||
console.log(" --explain - Include retrieval score traces (query --json/CLI)");
|
||||
console.log(" --files | --json | --csv | --md | --xml - Output format");
|
||||
|
||||
66
src/llm.ts
66
src/llm.ts
@ -22,10 +22,45 @@ type NodeLlamaCppModule = {
|
||||
|
||||
let nodeLlamaCppImport: Promise<NodeLlamaCppModule> | null = null;
|
||||
/**
 * Lazily imports `node-llama-cpp`, memoizing the import promise in
 * `nodeLlamaCppImport` so the module is only loaded once per process (or until
 * the cached promise is reset).
 *
 * The import runs inside `withNativeStdoutRedirectedToStderr`, so anything the
 * module writes through `process.stdout.write` while loading is sent to stderr
 * instead of polluting machine-readable stdout.
 *
 * @returns The cached (possibly still pending) module promise.
 */
async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
  // A single `??=` is essential: assigning a bare `import(...)` first would make
  // the redirect-wrapped import below unreachable.
  nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr(
    () => import("node-llama-cpp") as Promise<NodeLlamaCppModule>
  );
  return nodeLlamaCppImport;
}
|
||||
|
||||
/**
 * Test hook that replaces the memoized `node-llama-cpp` module.
 *
 * @param module - Module object to serve from the import cache, or `null` to
 *   clear the cache so the next load performs a fresh import.
 *
 * Also clears `failedGpuInitModes`, so a GPU-init failure recorded earlier in
 * the process does not carry over to the next caller.
 */
export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void {
  nodeLlamaCppImport = module ? Promise.resolve(module) : null;
  failedGpuInitModes.clear();
}
|
||||
|
||||
type StdoutWrite = typeof process.stdout.write;

/** Number of `withNativeStdoutRedirectedToStderr` calls currently in flight. */
let nativeStdoutRedirectDepth = 0;

/**
 * The `process.stdout.write` that was installed before the outermost redirect
 * began, or `null` while no redirect is active.
 */
let originalStdoutWrite: StdoutWrite | null = null;

/**
 * Some node-llama-cpp native build/probe paths write library noise to stdout.
 * JSON APIs must reserve stdout for machine-readable payloads, so route that
 * noise to stderr while native llama initialization is in progress.
 *
 * Calls may nest or overlap: the redirect is installed by the first caller and
 * removed only when the last in-flight caller settles (tracked by a depth
 * counter), whether `fn` resolves or rejects.
 *
 * NOTE: only writes that go through `process.stdout.write` are intercepted;
 * this function does not touch the underlying file descriptor.
 *
 * @param fn - Async operation to run while stdout writes are redirected.
 * @returns Whatever `fn` resolves to; rejections from `fn` propagate unchanged.
 */
export async function withNativeStdoutRedirectedToStderr<T>(fn: () => Promise<T>): Promise<T> {
  if (nativeStdoutRedirectDepth === 0) {
    // Keep the exact function reference rather than a bound copy. It is never
    // invoked here, only reinstated, so restoring it leaves
    // `process.stdout.write` identical to what callers (or spies) installed.
    originalStdoutWrite = process.stdout.write;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- mirrors the overloaded write() signature
    process.stdout.write = ((chunk: any, encoding?: any, cb?: any) => {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      return process.stderr.write(chunk, encoding, cb as any);
    }) as StdoutWrite;
  }
  nativeStdoutRedirectDepth++;
  try {
    return await fn();
  } finally {
    nativeStdoutRedirectDepth--;
    if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
      process.stdout.write = originalStdoutWrite;
      originalStdoutWrite = null;
    }
  }
}
|
||||
|
||||
import { homedir } from "os";
|
||||
import { join } from "path";
|
||||
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
|
||||
@ -487,7 +522,15 @@ export function resolveSafeParallelism(options: ParallelismOptions): number {
|
||||
return Math.max(1, options.computed);
|
||||
}
|
||||
|
||||
export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode {
|
||||
export function resolveLlamaGpuMode(
|
||||
envValue = process.env.QMD_LLAMA_GPU,
|
||||
forceCpuValue = process.env.QMD_FORCE_CPU
|
||||
): LlamaGpuMode {
|
||||
const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
|
||||
if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const normalized = envValue?.trim().toLowerCase() ?? "";
|
||||
if (!normalized) return "auto";
|
||||
if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false;
|
||||
@ -518,6 +561,8 @@ function resolveExpandContextSize(configValue?: number): number {
|
||||
return parsed;
|
||||
}
|
||||
|
||||
const failedGpuInitModes = new Set<LlamaGpuMode>();
|
||||
|
||||
export class LlamaCpp implements LLM {
|
||||
private readonly _ciMode = !!process.env.CI;
|
||||
private llama: Llama | null = null;
|
||||
@ -668,22 +713,29 @@ export class LlamaCpp implements LLM {
|
||||
|
||||
const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp();
|
||||
const loadLlama = async (gpu: LlamaGpuMode) =>
|
||||
await getLlama({
|
||||
await withNativeStdoutRedirectedToStderr(() => getLlama({
|
||||
build: allowBuild ? "autoAttempt" : "never",
|
||||
logLevel: LlamaLogLevel.error,
|
||||
gpu,
|
||||
skipDownload: !allowBuild,
|
||||
});
|
||||
}));
|
||||
|
||||
let llama: Llama;
|
||||
if (gpuMode === false) {
|
||||
if (gpuMode === false || failedGpuInitModes.has(gpuMode)) {
|
||||
if (gpuMode !== false && failedGpuInitModes.has(gpuMode)) {
|
||||
process.stderr.write(
|
||||
`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`
|
||||
);
|
||||
}
|
||||
llama = await loadLlama(false);
|
||||
} else {
|
||||
try {
|
||||
llama = await loadLlama(gpuMode);
|
||||
} catch (err) {
|
||||
// GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
|
||||
// Fall back to CPU so qmd still works.
|
||||
// GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
|
||||
// Fall back to CPU so qmd still works, and cache the failure to avoid repeated
|
||||
// expensive native build/probe attempts in this process.
|
||||
failedGpuInitModes.add(gpuMode);
|
||||
process.stderr.write(
|
||||
`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
|
||||
);
|
||||
|
||||
@ -233,6 +233,7 @@ describe("CLI Help", () => {
|
||||
expect(stdout).toContain("Usage:");
|
||||
expect(stdout).toContain("qmd collection add");
|
||||
expect(stdout).toContain("qmd search");
|
||||
expect(stdout).toContain("--no-gpu");
|
||||
expect(stdout).toContain("qmd skill show/install");
|
||||
});
|
||||
|
||||
|
||||
@ -13,6 +13,8 @@ import {
|
||||
getDefaultLlamaCpp,
|
||||
disposeDefaultLlamaCpp,
|
||||
resolveLlamaGpuMode,
|
||||
setNodeLlamaCppModuleForTest,
|
||||
withNativeStdoutRedirectedToStderr,
|
||||
resolveParallelismOverride,
|
||||
resolveSafeParallelism,
|
||||
withLLMSession,
|
||||
@ -78,6 +80,29 @@ describe("QMD_LLAMA_GPU resolution", () => {
|
||||
expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda");
|
||||
});
|
||||
|
||||
test("QMD_FORCE_CPU disables GPU before QMD_LLAMA_GPU auto-detection", () => {
|
||||
const prevForceCpu = process.env.QMD_FORCE_CPU;
|
||||
process.env.QMD_FORCE_CPU = "1";
|
||||
try {
|
||||
expect(resolveLlamaGpuMode(undefined)).toBe(false);
|
||||
expect(resolveLlamaGpuMode("cuda")).toBe(false);
|
||||
} finally {
|
||||
if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
|
||||
else process.env.QMD_FORCE_CPU = prevForceCpu;
|
||||
}
|
||||
});
|
||||
|
||||
test("QMD_FORCE_CPU ignores false-ish values", () => {
|
||||
const prevForceCpu = process.env.QMD_FORCE_CPU;
|
||||
process.env.QMD_FORCE_CPU = "0";
|
||||
try {
|
||||
expect(resolveLlamaGpuMode(undefined)).toBe("auto");
|
||||
} finally {
|
||||
if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
|
||||
else process.env.QMD_FORCE_CPU = prevForceCpu;
|
||||
}
|
||||
});
|
||||
|
||||
test("warns and falls back to auto for unsupported values", () => {
|
||||
const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
|
||||
try {
|
||||
@ -90,6 +115,71 @@ describe("QMD_LLAMA_GPU resolution", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("native llama stdout containment", () => {
|
||||
test("redirects native stdout noise to stderr while JSON callers are initializing llama", async () => {
|
||||
const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
|
||||
const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
|
||||
try {
|
||||
await withNativeStdoutRedirectedToStderr(async () => {
|
||||
process.stdout.write("cmake build spam\n");
|
||||
return "ok";
|
||||
});
|
||||
|
||||
expect(stdoutSpy).not.toHaveBeenCalled();
|
||||
expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
|
||||
} finally {
|
||||
stdoutSpy.mockRestore();
|
||||
stderrSpy.mockRestore();
|
||||
}
|
||||
});
|
||||
|
||||
test("keeps native GPU failure noise off stdout and caches failed GPU init", async () => {
|
||||
const prevGpu = process.env.QMD_LLAMA_GPU;
|
||||
const prevForceCpu = process.env.QMD_FORCE_CPU;
|
||||
process.env.QMD_LLAMA_GPU = "cuda";
|
||||
delete process.env.QMD_FORCE_CPU;
|
||||
|
||||
const calls: unknown[] = [];
|
||||
const fakeLlama = { gpu: false, cpuMathCores: 4 };
|
||||
setNodeLlamaCppModuleForTest({
|
||||
LlamaLogLevel: { error: "error" },
|
||||
resolveModelFile: vi.fn(),
|
||||
LlamaChatSession: vi.fn() as any,
|
||||
getLlama: vi.fn(async (options: Record<string, unknown>) => {
|
||||
calls.push(options.gpu);
|
||||
if (options.gpu === "cuda") {
|
||||
process.stdout.write("cmake build spam\n");
|
||||
throw new Error("CUDA unavailable");
|
||||
}
|
||||
return fakeLlama as any;
|
||||
}),
|
||||
});
|
||||
|
||||
const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
|
||||
const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
|
||||
try {
|
||||
const first = new LlamaCpp();
|
||||
const second = new LlamaCpp();
|
||||
|
||||
await (first as any).ensureLlama();
|
||||
await (second as any).ensureLlama();
|
||||
|
||||
expect(stdoutSpy).not.toHaveBeenCalled();
|
||||
expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
|
||||
expect(calls).toEqual(["cuda", false, false]);
|
||||
expect(String(stderrSpy.mock.calls.map(call => call[0]).join(""))).toContain("skipping previously failed GPU init");
|
||||
} finally {
|
||||
stdoutSpy.mockRestore();
|
||||
stderrSpy.mockRestore();
|
||||
setNodeLlamaCppModuleForTest(null);
|
||||
if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU;
|
||||
else process.env.QMD_LLAMA_GPU = prevGpu;
|
||||
if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
|
||||
else process.env.QMD_FORCE_CPU = prevForceCpu;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("LLM context parallelism safety", () => {
|
||||
test("defaults Windows CUDA to one context to avoid ggml-cuda.cu:98 crashes", () => {
|
||||
expect(resolveSafeParallelism({
|
||||
|
||||
Loading…
Reference in New Issue
Block a user