From 26e3d0c07789f99932d996bb78137dab1686d123 Mon Sep 17 00:00:00 2001 From: cocoon Date: Tue, 7 Apr 2026 23:18:58 +0800 Subject: [PATCH] fix(status): avoid build attempts during device probe --- src/cli/qmd.ts | 11 +++++++---- src/llm.ts | 9 +++++---- test/llm.test.ts | 26 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index a09ffb3..b0057ac 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -461,10 +461,10 @@ async function showStatus(): Promise { } // Device / GPU info + console.log(`\n${c.bold}Device${c.reset}`); try { const llm = getDefaultLlamaCpp(); - const device = await llm.getDeviceInfo(); - console.log(`\n${c.bold}Device${c.reset}`); + const device = await llm.getDeviceInfo({ allowBuild: false }); if (device.gpu) { console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); if (device.gpuDevices.length > 0) { @@ -486,8 +486,11 @@ async function showStatus(): Promise { console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); } console.log(` CPU: ${device.cpuCores} math cores`); - } catch { - // Don't fail status if LLM init fails + } catch (error) { + console.log(` Status: ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`); + if (error instanceof Error && error.message) { + console.log(` ${c.dim}${error.message}${c.reset}`); + } } // Tips section diff --git a/src/llm.ts b/src/llm.ts index 485ca7b..d07b89a 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -550,7 +550,7 @@ export class LlamaCpp implements LLM { /** * Initialize the llama instance (lazy) */ - private async ensureLlama(): Promise { + private async ensureLlama(allowBuild = true): Promise { if (!this.llama) { // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase(); @@ -558,9 +558,10 @@ export class LlamaCpp implements LLM { const loadLlama = async (gpu: "auto" | false) => await getLlama({ - build: "autoAttempt", + build: allowBuild ? "autoAttempt" : "never", logLevel: LlamaLogLevel.error, gpu, + skipDownload: !allowBuild, }); let llama: Llama; @@ -1244,14 +1245,14 @@ export class LlamaCpp implements LLM { * Get device/GPU info for status display. * Initializes llama if not already done. */ - async getDeviceInfo(): Promise<{ + async getDeviceInfo(options: { allowBuild?: boolean } = {}): Promise<{ gpu: string | false; gpuOffloading: boolean; gpuDevices: string[]; vram?: { total: number; used: number; free: number }; cpuCores: number; }> { - const llama = await this.ensureLlama(); + const llama = await this.ensureLlama(options.allowBuild ?? true); const gpuDevices = await llama.getGpuDeviceNames(); let vram: { total: number; used: number; free: number } | undefined; if (llama.gpu) { diff --git a/test/llm.test.ts b/test/llm.test.ts index d336036..f5c39cc 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -193,6 +193,32 @@ describe("LlamaCpp rerank deduping", () => { }); }); +describe("LlamaCpp.getDeviceInfo", () => { + test("can skip build attempts for status probes", async () => { + const llm = new LlamaCpp({}) as any; + const fakeLlama = { + gpu: "metal", + supportsGpuOffloading: true, + cpuMathCores: 8, + getGpuDeviceNames: vi.fn().mockResolvedValue(["Apple GPU"]), + getVramState: vi.fn().mockResolvedValue({ total: 1024, used: 256, free: 768 }), + }; + + llm.ensureLlama = vi.fn().mockResolvedValue(fakeLlama); + + const device = await llm.getDeviceInfo({ allowBuild: false }); + + expect(llm.ensureLlama).toHaveBeenCalledWith(false); + expect(device).toEqual({ + gpu: "metal", + gpuOffloading: true, + gpuDevices: ["Apple GPU"], + vram: { total: 1024, used: 256, free: 768 }, + cpuCores: 8, + }); + }); +}); + // ============================================================================= // Integration Tests (require actual models) // =============================================================================