feat: default to NVIDIA embeddings

2026-05-09 16:50:04 +08:00 · 2026-05-09 16:50:04 +08:00 · 7c17c8bcce
commit 7c17c8bcce
parent fbad5791e3
6 changed files with 100 additions and 30 deletions
--- a/README.md
+++ b/README.md
@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl

 ### Models

-QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
-
-```sh
-export QMD_EMBED_API_KEY="..."
-# Optional for non-OpenAI-compatible gateways:
-export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
-export QMD_EMBED_MODEL="text-embedding-3-small"
-```
-
-NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads
-`NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required
-`input_type` automatically (`passage` while indexing, `query` while searching):
+QMD uses NVIDIA NIM's OpenAI-compatible `/embeddings` API for vector embeddings
+by default. Configure it with:

 ```sh
 export NVIDIA_API_KEY="..."
 export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
 export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
+export QMD_DISABLE_LOCAL_MODELS=1  # optional: never load local GGUF models
 ```

+QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
+NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
+while searching).
+
+`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
+local GGUF models. In that mode QMD rejects local embedding model URIs, skips
+local query expansion, and turns search reranking off by default, while still
+using the configured external embedding service for vector search.
+
 Reranking and query expansion still use local GGUF models via node-llama-cpp:

 | Model | Purpose | Size |
@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
 Models are configured in `src/llm.ts`:

 ```typescript
-const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
+const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 ```
@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co

 ```yaml
 models:
-  embed: text-embedding-3-small
+  embed: nvidia/llama-3.2-nv-embedqa-1b-v2
  rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
  generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
 ```
--- a/bin/qmd
+++ b/bin/qmd
@ -16,9 +16,6 @@ done
 DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"

 # Detect the package manager that installed dependencies by checking lockfiles.
-# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
-# on the system, not that it was used to install this package (see #361).
-#
 # package-lock.json takes priority: if it exists, npm installed the native
 # modules for Node.  The repo ships bun.lock, so without this check, source
 # builds that use npm would be incorrectly routed to bun, causing ABI
@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then
  exec node "$DIR/dist/cli/qmd.js" "$@"
 elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
  exec bun "$DIR/dist/cli/qmd.js" "$@"
+elif command -v bun >/dev/null 2>&1; then
+  exec bun "$DIR/dist/cli/qmd.js" "$@"
+elif [ -x "$HOME/.bun/bin/bun" ]; then
+  exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@"
 else
  exec node "$DIR/dist/cli/qmd.js" "$@"
 fi
--- a/src/llm.ts
+++ b/src/llm.ts
@ -191,9 +191,9 @@ export type RerankDocument = {
 // Model Configuration
 // =============================================================================

-// Embeddings use an OpenAI-compatible API by default.
+// Embeddings use NVIDIA's OpenAI-compatible API by default.
 // Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
-const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
+const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
  : join(homedir(), ".cache", "qmd", "models");
 export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;

-const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1";
+const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
+
+export function localModelsDisabled(): boolean {
+  return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? "");
+}

 function isLocalEmbeddingModel(model: string): boolean {
  return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM {
    if (texts.length === 0) return [];
    if (!this.embedApiKey) {
      throw new Error(
-        "External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " +
+        "External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " +
        "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
      );
    }
@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM {

  async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
    const model = options.model ?? this.embedModelUri;
+    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
+      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
+    }
    if (!isLocalEmbeddingModel(model)) {
      const results = await this.embedExternal([text], model, options);
      return results[0] ?? null;
@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM {
   */
  async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
    const model = options.model ?? this.embedModelUri;
+    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
+      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
+    }
    if (!isLocalEmbeddingModel(model)) {
      return this.embedExternal(texts, model, options);
    }
@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM {
  // ==========================================================================

  async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
+    if (localModelsDisabled()) return [];
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();
@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM {
    documents: RerankDocument[],
    options: RerankOptions = {}
  ): Promise<RerankResult> {
+    if (localModelsDisabled()) {
+      return {
+        model: "disabled",
+        results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
+      };
+    }
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();
--- a/src/store.ts
+++ b/src/store.ts
@ -25,6 +25,7 @@ import {
  formatDocForEmbedding,
  withLLMSessionForLlm,
  DEFAULT_EMBED_MODEL_URI,
+  localModelsDisabled,
  type RerankDocument,
  type ILLMSession,
 } from "./llm.js";
@ -4012,7 +4013,7 @@ export async function hybridQuery(
  const collection = options?.collection;
  const explain = options?.explain ?? false;
  const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? false;
+  const skipRerank = options?.skipRerank ?? localModelsDisabled();
  const hooks = options?.hooks;

  const rankedLists: RankedResult[][] = [];
@ -4407,7 +4408,7 @@ export async function structuredSearch(
  const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
  const explain = options?.explain ?? false;
  const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? false;
+  const skipRerank = options?.skipRerank ?? localModelsDisabled();
  const hooks = options?.hooks;

  const collections = options?.collections;
--- a/test/launcher-detection.test.sh
+++ b/test/launcher-detection.test.sh
@ -20,10 +20,16 @@ fail() { printf "  %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1)
 # Instead of exec-ing a runtime, we echo which one would be chosen.
 detect_runtime() {
  local DIR="$1"
+  local BUN_AVAILABLE="${2:-1}"
+  local HOME_BUN_AVAILABLE="${3:-1}"
  if [ -f "$DIR/package-lock.json" ]; then
    echo "node"
  elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
    echo "bun"
+  elif [[ "$BUN_AVAILABLE" == "1" ]]; then
+    echo "bun"
+  elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then
+    echo "home-bun"
  else
    echo "node"
  fi
@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ==="

 # --- Test cases ---

-# 1. No lockfiles → default to node
+# 1. No lockfiles → default to bun when bun is available
 d="$TMPDIR_BASE/no-lockfiles"
 mkdir -p "$d"
-assert_runtime "no lockfiles → node" "$d" "node"
+assert_runtime "no lockfiles + bun available → bun" "$d" "bun"
+
+got=$(detect_runtime "$d" 0 0)
+if [[ "$got" == "node" ]]; then
+  ok "no lockfiles + no bun → node"
+else
+  fail "no lockfiles + no bun → node" "$got" "node"
+fi
+
+got=$(detect_runtime "$d" 0 1)
+if [[ "$got" == "home-bun" ]]; then
+  ok "no lockfiles + home bun → home-bun"
+else
+  fail "no lockfiles + home bun → home-bun" "$got" "home-bun"
+fi

 # 2. Only bun.lock → bun
 d="$TMPDIR_BASE/bun-lock-only"
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
 });

 describe("LlamaCpp model resolution (config > env > default)", () => {
-  const HARDCODED_EMBED = "text-embedding-3-small";
+  const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2";
  const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";

@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
    }
  });

-  test("default embedding uses external OpenAI-compatible API", async () => {
+  test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
    const prevKey = process.env.QMD_EMBED_API_KEY;
+    const prevNvidiaKey = process.env.NVIDIA_API_KEY;
+    const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
    process.env.QMD_EMBED_API_KEY = "test-key";
+    delete process.env.NVIDIA_API_KEY;
+    delete process.env.QMD_EMBED_API_BASE_URL;
    const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
      ok: true,
      json: async () => ({
-        model: "text-embedding-3-small",
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
        data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
      }),
    } as Response);
@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
    try {
      const llm = new LlamaCpp({});
      const result = await llm.embed("hello");
-      expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({
+      expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({
        method: "POST",
      }));
+      const [, init] = fetchMock.mock.calls[0]!;
+      expect(JSON.parse((init as RequestInit).body as string)).toEqual({
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
+        input: ["hello"],
+        input_type: "passage",
+      });
      expect(result).toEqual({
        embedding: [0.1, 0.2, 0.3],
-        model: "text-embedding-3-small",
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
      });
    } finally {
      fetchMock.mockRestore();
      if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
      else process.env.QMD_EMBED_API_KEY = prevKey;
+      if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
+      else process.env.NVIDIA_API_KEY = prevNvidiaKey;
+      if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL;
+      else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl;
    }
  });

@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
    const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
    expect(llm.usesLocalEmbedding).toBe(true);
  });
+
+  test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
+    const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
+    process.env.QMD_DISABLE_LOCAL_MODELS = "1";
+    try {
+      const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
+      await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
+      await expect(llm.expandQuery("hello")).resolves.toEqual([]);
+      await expect(llm.rerank("hello", [{ file: "doc.md", text: "hello" }])).resolves.toEqual({
+        model: "disabled",
+        results: [{ file: "doc.md", score: 0, index: 0 }],
+      });
+    } finally {
+      if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
+      else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
+    }
+  });
 });

 describe("LlamaCpp embedding truncation", () => {