diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8feb5d..1e50d26 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Fixes
+
+- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
+
 ## [2.1.0] - 2026-04-05
 
 Code files now chunk at function and class boundaries via tree-sitter,
diff --git a/src/llm.ts b/src/llm.ts
index 485ca7b..6f9e982 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -385,6 +385,28 @@ export type LlamaCppConfig = {
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
 
+type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
+
+/**
+ * Resolve the QMD_LLAMA_GPU override into the GPU mode handed to getLlama().
+ *
+ * Matching is case-insensitive and ignores surrounding whitespace:
+ * - unset, blank, or "auto" selects automatic GPU detection;
+ * - "false", "off", "none", "disable", "disabled", or "0" forces CPU (false);
+ * - "metal", "vulkan", or "cuda" selects that backend explicitly.
+ * Any other value writes a warning to stderr and falls back to "auto".
+ */
+export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode {
+  const normalized = envValue?.trim().toLowerCase() ?? "";
+  // An explicit "auto" names the default behavior, so accept it without a warning.
+  if (!normalized || normalized === "auto") return "auto";
+  if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false;
+  if (normalized === "metal" || normalized === "vulkan" || normalized === "cuda") return normalized;
+
+  process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
+  return "auto";
+}
+
 function resolveExpandContextSize(configValue?: number): number {
   if (configValue !== undefined) {
     if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -552,11 +574,9 @@ export class LlamaCpp implements LLM {
    */
   private async ensureLlama(): Promise<Llama> {
     if (!this.llama) {
-      // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
-      const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
-      const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
+      const gpuMode = resolveLlamaGpuMode();
 
-      const loadLlama = async (gpu: "auto" | false) =>
+      const loadLlama = async (gpu: LlamaGpuMode) =>
         await getLlama({
           build: "autoAttempt",
           logLevel: LlamaLogLevel.error,
@@ -564,16 +584,16 @@ export class LlamaCpp implements LLM {
         });
 
       let llama: Llama;
-      if (forceCpu) {
+      if (gpuMode === false) {
         llama = await loadLlama(false);
       } else {
         try {
-          llama = await loadLlama("auto");
+          llama = await loadLlama(gpuMode);
         } catch (err) {
           // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
           // Fall back to CPU so qmd still works.
           process.stderr.write(
-            `QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
+            `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
           );
           llama = await loadLlama(false);
         }
diff --git a/test/llm.test.ts b/test/llm.test.ts
index d336036..f3797f0 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -12,6 +12,7 @@ import {
   LlamaCpp,
   getDefaultLlamaCpp,
   disposeDefaultLlamaCpp,
+  resolveLlamaGpuMode,
   withLLMSession,
   canUnloadLLM,
   SessionReleasedError,
@@ -55,6 +56,49 @@ describe("LlamaCpp.modelExists", () => {
   });
 });
 
+describe("QMD_LLAMA_GPU resolution", () => {
+  test("uses auto when unset or blank", () => {
+    expect(resolveLlamaGpuMode(undefined)).toBe("auto");
+    expect(resolveLlamaGpuMode(" ")).toBe("auto");
+  });
+
+  test("accepts explicit auto without warning", () => {
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      expect(resolveLlamaGpuMode("auto")).toBe("auto");
+      expect(resolveLlamaGpuMode(" AUTO ")).toBe("auto");
+      expect(stderrSpy).not.toHaveBeenCalled();
+    } finally {
+      stderrSpy.mockRestore();
+    }
+  });
+
+  test("maps CPU disable values to false", () => {
+    expect(resolveLlamaGpuMode("false")).toBe(false);
+    expect(resolveLlamaGpuMode("OFF")).toBe(false);
+    expect(resolveLlamaGpuMode(" none ")).toBe(false);
+    expect(resolveLlamaGpuMode("disabled")).toBe(false);
+    expect(resolveLlamaGpuMode("0")).toBe(false);
+  });
+
+  test("passes through supported GPU backends", () => {
+    expect(resolveLlamaGpuMode("metal")).toBe("metal");
+    expect(resolveLlamaGpuMode("VULKAN")).toBe("vulkan");
+    expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda");
+  });
+
+  test("warns and falls back to auto for unsupported values", () => {
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      expect(resolveLlamaGpuMode("rocm")).toBe("auto");
+      expect(stderrSpy).toHaveBeenCalled();
+      expect(String(stderrSpy.mock.calls[0]?.[0] || "")).toContain("QMD_LLAMA_GPU");
+    } finally {
+      stderrSpy.mockRestore();
+    }
+  });
+});
+
 describe("LlamaCpp expand context size config", () => {
   const defaultExpandContextSize = 2048;
 