feat: default to NVIDIA embeddings

2026-05-09 16:50:04 +08:00 · 2026-05-09 16:50:04 +08:00 · 7c17c8bcce
commit 7c17c8bcce
parent fbad5791e3
6 changed files with 100 additions and 30 deletions
--- a/README.md
+++ b/README.md
@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
 ### Models
-QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
+QMD uses NVIDIA NIM's OpenAI-compatible `/embeddings` API for vector embeddings
-
+by default. Configure it with:
 ```sh
 export QMD_EMBED_API_KEY="..."
 # Optional: override the endpoint and model to use a different OpenAI-compatible gateway:
 export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
 export QMD_EMBED_MODEL="text-embedding-3-small"
 ```
 NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads
 `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required
 `input_type` automatically (`passage` while indexing, `query` while searching):
 ```sh
 export NVIDIA_API_KEY="..."
 export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
 export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
 export QMD_DISABLE_LOCAL_MODELS=1
 ```
 QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
 NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
 while searching).
 `QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
 local GGUF models. In that mode QMD rejects local embedding model URIs, skips
 local query expansion, and defaults search reranking off while still using the
 configured external embedding service for vector search.
 Unless `QMD_DISABLE_LOCAL_MODELS=1` is set, reranking and query expansion still use local GGUF models via node-llama-cpp:
 | Model | Purpose | Size |
@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
 Models are configured in `src/llm.ts`:
 ```typescript
-const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
+const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 ```
@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
 ```yaml
 models:
-  embed: text-embedding-3-small
+  embed: nvidia/llama-3.2-nv-embedqa-1b-v2
  rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
  generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
 ```
--- a/bin/qmd
+++ b/bin/qmd
@ -16,9 +16,6 @@ done
 DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
 # Detect the package manager that installed dependencies by checking lockfiles.
 # $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
 # on the system, not that it was used to install this package (see #361).
 #
 # package-lock.json takes priority: if it exists, npm installed the native
 # modules for Node.  The repo ships bun.lock, so without this check, source
 # builds that use npm would be incorrectly routed to bun, causing ABI
@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then
  exec node "$DIR/dist/cli/qmd.js" "$@"
 elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
  exec bun "$DIR/dist/cli/qmd.js" "$@"
 elif command -v bun >/dev/null 2>&1; then
  exec bun "$DIR/dist/cli/qmd.js" "$@"
 elif [ -x "$HOME/.bun/bin/bun" ]; then
  exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@"
 else
  exec node "$DIR/dist/cli/qmd.js" "$@"
 fi
--- a/src/llm.ts
+++ b/src/llm.ts
@ -191,9 +191,9 @@ export type RerankDocument = {
 // Model Configuration
 // =============================================================================
-// Embeddings use an OpenAI-compatible API by default.
+// Embeddings use NVIDIA's OpenAI-compatible API by default.
 // Set QMD_EMBED_MODEL to an hf: URI or a .gguf path to opt into local node-llama-cpp embeddings.
-const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
+const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
  : join(homedir(), ".cache", "qmd", "models");
 export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
-const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1";
+const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
 /**
  * Reports whether local model loading has been switched off through the
  * `QMD_DISABLE_LOCAL_MODELS` environment variable.
  *
  * The variable is read on every call. Only the exact values `1`, `true`,
  * `yes` or `on` (compared case-insensitively) enable the switch; an unset
  * variable is treated as the empty string and therefore leaves it off.
  *
  * @returns `true` when the variable holds one of the accepted values.
  */
 export function localModelsDisabled(): boolean {
   const rawFlag = process.env.QMD_DISABLE_LOCAL_MODELS ?? "";
   const enabledPattern = /^(1|true|yes|on)$/i;
   return enabledPattern.test(rawFlag);
 }
 function isLocalEmbeddingModel(model: string): boolean {
  return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM {
    if (texts.length === 0) return [];
    if (!this.embedApiKey) {
      throw new Error(
-        "External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " +
+        "External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " +
        "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
      );
    }
@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM {
  async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
    const model = options.model ?? this.embedModelUri;
    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
    }
    if (!isLocalEmbeddingModel(model)) {
      const results = await this.embedExternal([text], model, options);
      return results[0] ?? null;
@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM {
   */
  async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
    const model = options.model ?? this.embedModelUri;
    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
    }
    if (!isLocalEmbeddingModel(model)) {
      return this.embedExternal(texts, model, options);
    }
@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM {
  // ==========================================================================
  async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
    if (localModelsDisabled()) return [];
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();
@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM {
    documents: RerankDocument[],
    options: RerankOptions = {}
  ): Promise<RerankResult> {
    if (localModelsDisabled()) {
      return {
        model: "disabled",
        results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
      };
    }
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();
--- a/src/store.ts
+++ b/src/store.ts
@ -25,6 +25,7 @@ import {
  formatDocForEmbedding,
  withLLMSessionForLlm,
  DEFAULT_EMBED_MODEL_URI,
  localModelsDisabled,
  type RerankDocument,
  type ILLMSession,
 } from "./llm.js";
@ -4012,7 +4013,7 @@ export async function hybridQuery(
  const collection = options?.collection;
  const explain = options?.explain ?? false;
  const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? false;
+  const skipRerank = options?.skipRerank ?? localModelsDisabled();
  const hooks = options?.hooks;
  const rankedLists: RankedResult[][] = [];
@ -4407,7 +4408,7 @@ export async function structuredSearch(
  const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
  const explain = options?.explain ?? false;
  const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? false;
+  const skipRerank = options?.skipRerank ?? localModelsDisabled();
  const hooks = options?.hooks;
  const collections = options?.collections;
--- a/test/launcher-detection.test.sh
+++ b/test/launcher-detection.test.sh
@ -20,10 +20,16 @@ fail() { printf "  %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1)
 # Instead of exec-ing a runtime, we echo which one would be chosen.
 detect_runtime() {
  local DIR="$1"
  local BUN_AVAILABLE="${2:-1}"
  local HOME_BUN_AVAILABLE="${3:-1}"
  if [ -f "$DIR/package-lock.json" ]; then
    echo "node"
  elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
    echo "bun"
  elif [[ "$BUN_AVAILABLE" == "1" ]]; then
    echo "bun"
  elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then
    echo "home-bun"
  else
    echo "node"
  fi
@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ==="
 # --- Test cases ---
-# 1. No lockfiles → default to node
+# 1. No lockfiles → default to bun when bun is available
 d="$TMPDIR_BASE/no-lockfiles"
 mkdir -p "$d"
-assert_runtime "no lockfiles → node" "$d" "node"
+assert_runtime "no lockfiles + bun available → bun" "$d" "bun"
 got=$(detect_runtime "$d" 0 0)
 if [[ "$got" == "node" ]]; then
  ok "no lockfiles + no bun → node"
 else
  fail "no lockfiles + no bun → node" "$got" "node"
 fi
 got=$(detect_runtime "$d" 0 1)
 if [[ "$got" == "home-bun" ]]; then
  ok "no lockfiles + home bun → home-bun"
 else
  fail "no lockfiles + home bun → home-bun" "$got" "home-bun"
 fi
 # 2. Only bun.lock → bun
 d="$TMPDIR_BASE/bun-lock-only"
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
 });
 describe("LlamaCpp model resolution (config > env > default)", () => {
-  const HARDCODED_EMBED = "text-embedding-3-small";
+  const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2";
  const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
    }
  });
-  test("default embedding uses external OpenAI-compatible API", async () => {
+  test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
    const prevKey = process.env.QMD_EMBED_API_KEY;
    const prevNvidiaKey = process.env.NVIDIA_API_KEY;
    const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
    process.env.QMD_EMBED_API_KEY = "test-key";
    delete process.env.NVIDIA_API_KEY;
    delete process.env.QMD_EMBED_API_BASE_URL;
    const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
      ok: true,
      json: async () => ({
-        model: "text-embedding-3-small",
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
        data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
      }),
    } as Response);
@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
    try {
      const llm = new LlamaCpp({});
      const result = await llm.embed("hello");
-      expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({
+      expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({
        method: "POST",
      }));
      const [, init] = fetchMock.mock.calls[0]!;
      expect(JSON.parse((init as RequestInit).body as string)).toEqual({
        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
        input: ["hello"],
        input_type: "passage",
      });
      expect(result).toEqual({
        embedding: [0.1, 0.2, 0.3],
-        model: "text-embedding-3-small",
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
      });
    } finally {
      fetchMock.mockRestore();
      if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
      else process.env.QMD_EMBED_API_KEY = prevKey;
      if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
      else process.env.NVIDIA_API_KEY = prevNvidiaKey;
      if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL;
      else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl;
    }
  });
@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
    const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
    expect(llm.usesLocalEmbedding).toBe(true);
  });
  test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
    // Snapshot the flag so the finally block can restore the environment exactly.
    const savedFlag = process.env.QMD_DISABLE_LOCAL_MODELS;
    process.env.QMD_DISABLE_LOCAL_MODELS = "1";
    try {
      const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });

      // An hf: embedding model is local, so embedding must be refused.
      await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");

      // Query expansion short-circuits to an empty list.
      await expect(llm.expandQuery("hello")).resolves.toEqual([]);

      // Reranking hands every document back with score 0 under the "disabled" model.
      const docs = [{ file: "doc.md", text: "hello" }];
      const expectedRerank = {
        model: "disabled",
        results: [{ file: "doc.md", score: 0, index: 0 }],
      };
      await expect(llm.rerank("hello", docs)).resolves.toEqual(expectedRerank);
    } finally {
      if (savedFlag !== undefined) process.env.QMD_DISABLE_LOCAL_MODELS = savedFlag;
      else delete process.env.QMD_DISABLE_LOCAL_MODELS;
    }
  });
 });
 describe("LlamaCpp embedding truncation", () => {