From 7c17c8bcce22ce0e000509d5558f41877e532ed4 Mon Sep 17 00:00:00 2001
From: Haitao Pan <manbuzhe2009@qq.com>
Date: Sat, 9 May 2026 16:50:04 +0800
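Subject: [PATCH] feat: default to NVIDIA embeddings

Also add QMD_DISABLE_LOCAL_MODELS to bypass local GGUF models, and let the
bin/qmd launcher fall back to bun when no lockfile is present.
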
---
 README.md                       | 28 +++++++++++-----------
 bin/qmd                         |  7 +++---
 src/llm.ts                      | 25 ++++++++++++++++----
 src/store.ts                    |  5 ++--
 test/launcher-detection.test.sh | 24 +++++++++++++++++--
 test/llm.test.ts                | 41 +++++++++++++++++++++++++++++----
 6 files changed, 100 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index ede27aa..74efadd 100644
--- a/README.md
+++ b/README.md
@@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
 
 ### Models
 
-QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
-
-```sh
-export QMD_EMBED_API_KEY="..."
-# Optional for non-OpenAI-compatible gateways:
-export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
-export QMD_EMBED_MODEL="text-embedding-3-small"
-```
-
-NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads
-`NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required
-`input_type` automatically (`passage` while indexing, `query` while searching):
+QMD uses `nvidia/llama-3.2-nv-embedqa-1b-v2` through NVIDIA NIM's
+OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
 
 ```sh
 export NVIDIA_API_KEY="..."
 export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
 export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
+export QMD_DISABLE_LOCAL_MODELS=1
 ```
 
+QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
+NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
+while searching). The base URL and model shown above are the built-in defaults.
+
+`QMD_DISABLE_LOCAL_MODELS=1` (`true`, `yes`, and `on` also work) is recommended for
+deployments that must not load local GGUF models. In that mode QMD rejects local
+embedding model URIs, skips local query expansion, and turns search reranking off by
+default, while still using the configured external embedding service for vector search.
+
 Reranking and query expansion still use local GGUF models via node-llama-cpp:
 
 | Model | Purpose | Size |
@@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
 Models are configured in `src/llm.ts`:
 
 ```typescript
-const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
+const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 ```
@@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
 
 ```yaml
 models:
-  embed: text-embedding-3-small
+  embed: nvidia/llama-3.2-nv-embedqa-1b-v2
   rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
   generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
 ```
diff --git a/bin/qmd b/bin/qmd
index f658b3b..b667080 100755
--- a/bin/qmd
+++ b/bin/qmd
@@ -16,9 +16,6 @@ done
 DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
 
 # Detect the package manager that installed dependencies by checking lockfiles.
-# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
-# on the system, not that it was used to install this package (see #361).
-#
 # package-lock.json takes priority: if it exists, npm installed the native
 # modules for Node.  The repo ships bun.lock, so without this check, source
 # builds that use npm would be incorrectly routed to bun, causing ABI
@@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then
   exec node "$DIR/dist/cli/qmd.js" "$@"
 elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
   exec bun "$DIR/dist/cli/qmd.js" "$@"
+elif command -v bun >/dev/null 2>&1; then
+  exec bun "$DIR/dist/cli/qmd.js" "$@"
+elif [ -x "$HOME/.bun/bin/bun" ]; then
+  exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@"
 else
   exec node "$DIR/dist/cli/qmd.js" "$@"
 fi
diff --git a/src/llm.ts b/src/llm.ts
index 5022d27..cb8c6a2 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -191,9 +191,9 @@ export type RerankDocument = {
 // Model Configuration
 // =============================================================================
 
-// Embeddings use an OpenAI-compatible API by default.
+// Embeddings use NVIDIA's OpenAI-compatible API by default.
 // Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
-const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
+const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
   : join(homedir(), ".cache", "qmd", "models");
 export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
 
-const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1";
+const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
+
+export function localModelsDisabled(): boolean {
+  return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? "");
+}
 
 function isLocalEmbeddingModel(model: string): boolean {
   return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
@@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM {
     if (texts.length === 0) return [];
     if (!this.embedApiKey) {
       throw new Error(
-        "External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " +
+        "External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " +
         "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
       );
     }
@@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM {
 
   async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
     const model = options.model ?? this.embedModelUri;
+    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
+      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
+    }
     if (!isLocalEmbeddingModel(model)) {
       const results = await this.embedExternal([text], model, options);
       return results[0] ?? null;
@@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM {
    */
   async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
     const model = options.model ?? this.embedModelUri;
+    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
+      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
+    }
     if (!isLocalEmbeddingModel(model)) {
       return this.embedExternal(texts, model, options);
     }
@@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM {
   // ==========================================================================
 
   async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
+    if (localModelsDisabled()) return [];
     if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
     // Ping activity at start to keep models alive during this operation
     this.touchActivity();
@@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM {
     documents: RerankDocument[],
     options: RerankOptions = {}
   ): Promise<RerankResult> {
+    if (localModelsDisabled()) {
+      return {
+        model: "disabled",
+        results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
+      };
+    }
     if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
     // Ping activity at start to keep models alive during this operation
     this.touchActivity();
diff --git a/src/store.ts b/src/store.ts
index c9d1115..f7c853a 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -25,6 +25,7 @@ import {
   formatDocForEmbedding,
   withLLMSessionForLlm,
   DEFAULT_EMBED_MODEL_URI,
+  localModelsDisabled,
   type RerankDocument,
   type ILLMSession,
 } from "./llm.js";
@@ -4012,7 +4013,7 @@ export async function hybridQuery(
   const collection = options?.collection;
   const explain = options?.explain ?? false;
   const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? false;
+  const skipRerank = options?.skipRerank ?? localModelsDisabled();
   const hooks = options?.hooks;
 
   const rankedLists: RankedResult[][] = [];
@@ -4407,7 +4408,7 @@ export async function structuredSearch(
   const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
   const explain = options?.explain ?? false;
   const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? false;
+  const skipRerank = options?.skipRerank ?? localModelsDisabled();
   const hooks = options?.hooks;
 
   const collections = options?.collections;
diff --git a/test/launcher-detection.test.sh b/test/launcher-detection.test.sh
index abd0daa..833e758 100644
--- a/test/launcher-detection.test.sh
+++ b/test/launcher-detection.test.sh
@@ -20,10 +20,16 @@ fail() { printf "  %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1)
 # Instead of exec-ing a runtime, we echo which one would be chosen.
 detect_runtime() {
   local DIR="$1"
+  local BUN_AVAILABLE="${2:-1}"
+  local HOME_BUN_AVAILABLE="${3:-1}"
   if [ -f "$DIR/package-lock.json" ]; then
     echo "node"
   elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
     echo "bun"
+  elif [[ "$BUN_AVAILABLE" == "1" ]]; then
+    echo "bun"
+  elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then
+    echo "home-bun"
   else
     echo "node"
   fi
@@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ==="
 
 # --- Test cases ---
 
-# 1. No lockfiles → default to node
+# 1. No lockfiles → default to bun when bun is available
 d="$TMPDIR_BASE/no-lockfiles"
 mkdir -p "$d"
-assert_runtime "no lockfiles → node" "$d" "node"
+assert_runtime "no lockfiles + bun available → bun" "$d" "bun"
+
+got=$(detect_runtime "$d" 0 0)
+if [[ "$got" == "node" ]]; then
+  ok "no lockfiles + no bun → node"
+else
+  fail "no lockfiles + no bun → node" "$got" "node"
+fi
+
+got=$(detect_runtime "$d" 0 1)
+if [[ "$got" == "home-bun" ]]; then
+  ok "no lockfiles + home bun → home-bun"
+else
+  fail "no lockfiles + home bun → home-bun" "$got" "home-bun"
+fi
 
 # 2. Only bun.lock → bun
 d="$TMPDIR_BASE/bun-lock-only"
diff --git a/test/llm.test.ts b/test/llm.test.ts
index b1dd64f..f1b09ae 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
 });
 
 describe("LlamaCpp model resolution (config > env > default)", () => {
-  const HARDCODED_EMBED = "text-embedding-3-small";
+  const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2";
   const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
   const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 
@@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
     }
   });
 
-  test("default embedding uses external OpenAI-compatible API", async () => {
+  test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
     const prevKey = process.env.QMD_EMBED_API_KEY;
+    const prevNvidiaKey = process.env.NVIDIA_API_KEY;
+    const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
     process.env.QMD_EMBED_API_KEY = "test-key";
+    delete process.env.NVIDIA_API_KEY;
+    delete process.env.QMD_EMBED_API_BASE_URL;
     const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
       ok: true,
       json: async () => ({
-        model: "text-embedding-3-small",
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
         data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
       }),
     } as Response);
@@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
     try {
       const llm = new LlamaCpp({});
       const result = await llm.embed("hello");
-      expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({
+      expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({
         method: "POST",
       }));
+      const [, init] = fetchMock.mock.calls[0]!;
+      expect(JSON.parse((init as RequestInit).body as string)).toEqual({
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
+        input: ["hello"],
+        input_type: "passage",
+      });
       expect(result).toEqual({
         embedding: [0.1, 0.2, 0.3],
-        model: "text-embedding-3-small",
+        model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
       });
     } finally {
       fetchMock.mockRestore();
       if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
       else process.env.QMD_EMBED_API_KEY = prevKey;
+      if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
+      else process.env.NVIDIA_API_KEY = prevNvidiaKey;
+      if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL;
+      else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl;
     }
   });
 
@@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
     const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
     expect(llm.usesLocalEmbedding).toBe(true);
   });
+
+  test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
+    const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
+    process.env.QMD_DISABLE_LOCAL_MODELS = "1";
+    try {
+      const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
+      await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
+      await expect(llm.expandQuery("hello")).resolves.toEqual([]);
+      await expect(llm.rerank("hello", [{ file: "doc.md", text: "hello" }])).resolves.toEqual({
+        model: "disabled",
+        results: [{ file: "doc.md", score: 0, index: 0 }],
+      });
+    } finally {
+      if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
+      else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
+    }
+  });
 });
 
 describe("LlamaCpp embedding truncation", () => {