fix(cli): keep status from importing llama

2026-05-09 18:12:37 +00:00 · 2026-05-09 18:12:37 +00:00 · 3d991b2a47
commit 3d991b2a47
parent d58fedf4b5
3 changed files with 49 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,9 @@
 - CLI: make `qmd status` skip native `node-llama-cpp` device probing by
  default so status stays safe on machines with broken or unsupported GPU
  drivers. Set `QMD_STATUS_DEVICE_PROBE=1` to opt in.
+- CLI: lazy-load `node-llama-cpp` so lightweight commands such as
+  `qmd status` do not import native ML dependencies or trigger llama.cpp
+  builds on ARM/no-GPU machines. #491

 ## [2.1.0] - 2026-04-05

--- a/src/llm.ts
+++ b/src/llm.ts
@@ -4,16 +4,28 @@
 * Provides embeddings, text generation, and reranking using local GGUF models.
 */

-import {
-  getLlama,
-  resolveModelFile,
-  LlamaChatSession,
-  LlamaLogLevel,
-  type Llama,
-  type LlamaModel,
-  type LlamaEmbeddingContext,
-  type Token as LlamaToken,
+import type {
+  Llama,
+  LlamaModel,
+  LlamaEmbeddingContext,
+  Token as LlamaToken,
 } from "node-llama-cpp";
+
+type NodeLlamaCppModule = {
+  getLlama: (options: Record<string, unknown>) => Promise<Llama>;
+  resolveModelFile: (model: string, cacheDir: string) => Promise<string>;
+  LlamaChatSession: new (options: { contextSequence: unknown }) => {
+    prompt: (prompt: string, options?: Record<string, unknown>) => Promise<string>;
+  };
+  LlamaLogLevel: { error: unknown };
+};
+
+let nodeLlamaCppImport: Promise<NodeLlamaCppModule> | null = null;
+async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
+  nodeLlamaCppImport ??= import("node-llama-cpp") as Promise<NodeLlamaCppModule>;
+  return nodeLlamaCppImport;
+}
+
 import { homedir } from "os";
 import { join } from "path";
 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
@@ -344,6 +356,7 @@ export async function pullModels(
      }
    }

+    const { resolveModelFile } = await loadNodeLlamaCpp();
    const path = await resolveModelFile(model, cacheDir);
    validateGgufFile(path, model);
    const sizeBytes = existsSync(path) ? statSync(path).size : 0;
@@ -619,6 +632,7 @@ export class LlamaCpp implements LLM {
    if (!this.llama) {
      const gpuMode = resolveLlamaGpuMode();

+      const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp();
      const loadLlama = async (gpu: LlamaGpuMode) =>
        await getLlama({
          build: allowBuild ? "autoAttempt" : "never",
@@ -661,6 +675,7 @@ export class LlamaCpp implements LLM {
  private async resolveModel(modelUri: string): Promise<string> {
    this.ensureModelCacheDir();
    // resolveModelFile handles HF URIs and downloads to the cache dir
+    const { resolveModelFile } = await loadNodeLlamaCpp();
    const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
    validateGgufFile(modelPath, modelUri);
    return modelPath;
@@ -1079,6 +1094,7 @@ export class LlamaCpp implements LLM {
    // Create fresh context -> sequence -> session for each call
    const context = await this.generateModel!.createContext();
    const sequence = context.getSequence();
+    const { LlamaChatSession } = await loadNodeLlamaCpp();
    const session = new LlamaChatSession({ contextSequence: sequence });

    const maxTokens = options.maxTokens ?? 150;
@@ -1158,6 +1174,7 @@ export class LlamaCpp implements LLM {
      contextSize: this.expandContextSize,
    });
    const sequence = genContext.getSequence();
+    const { LlamaChatSession } = await loadNodeLlamaCpp();
    const session = new LlamaChatSession({ contextSequence: sequence });

    try {
--- a/test/cli-lazy-llm-import.test.ts
+++ b/test/cli-lazy-llm-import.test.ts
@@ -0,0 +1,20 @@
+import { describe, expect, test } from "vitest";
+import { readFileSync } from "fs";
+import { join } from "path";
+
+describe("LLM module loading", () => {
+  test("node-llama-cpp is only dynamically imported by LLM operations", () => {
+    const source = readFileSync(join(process.cwd(), "src", "llm.ts"), "utf-8");
+
+    expect(source).not.toMatch(/import\s+(?!type\b)[\s\S]*?from\s+["']node-llama-cpp["']/);
+    expect(source).toContain('import("node-llama-cpp")');
+  });
+
+  test("importing the CLI for lightweight commands succeeds", async () => {
+    const mod = await import("../src/cli/qmd.ts");
+    expect(mod).toMatchObject({
+      buildEditorUri: expect.any(Function),
+      termLink: expect.any(Function),
+    });
+  });
+});