From 7c17c8bcce22ce0e000509d5558f41877e532ed4 Mon Sep 17 00:00:00 2001 From: Haitao Pan Date: Sat, 9 May 2026 16:50:04 +0800 Subject: [PATCH] feat: default to NVIDIA embeddings --- README.md | 28 +++++++++++----------- bin/qmd | 7 +++--- src/llm.ts | 25 ++++++++++++++++---- src/store.ts | 5 ++-- test/launcher-detection.test.sh | 24 +++++++++++++++++-- test/llm.test.ts | 41 +++++++++++++++++++++++++++++---- 6 files changed, 100 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index ede27aa..74efadd 100644 --- a/README.md +++ b/README.md @@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl ### Models -QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with: - -```sh -export QMD_EMBED_API_KEY="..." -# Optional for non-OpenAI-compatible gateways: -export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1" -export QMD_EMBED_MODEL="text-embedding-3-small" -``` - -NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads -`NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required -`input_type` automatically (`passage` while indexing, `query` while searching): +QMD uses NVIDIA NIM's OpenAI-compatible `/embeddings` API for vector embeddings +by default. Configure it with: ```sh export NVIDIA_API_KEY="..." export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1" export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2" +export QMD_DISABLE_LOCAL_MODELS=1 ``` +QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends +NVIDIA's required `input_type` automatically (`passage` while indexing, `query` +while searching). + +`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load +local GGUF models. In that mode QMD rejects local embedding model URIs, skips +local query expansion, and defaults search reranking off while still using the +configured external embedding service for vector search. + Reranking and query expansion still use local GGUF models via node-llama-cpp: | Model | Purpose | Size | @@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2] Models are configured in `src/llm.ts`: ```typescript -const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; +const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; ``` @@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co ```yaml models: - embed: text-embedding-3-small + embed: nvidia/llama-3.2-nv-embedqa-1b-v2 rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf ``` diff --git a/bin/qmd b/bin/qmd index f658b3b..b667080 100755 --- a/bin/qmd +++ b/bin/qmd @@ -16,9 +16,6 @@ done DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)" # Detect the package manager that installed dependencies by checking lockfiles. -# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists -# on the system, not that it was used to install this package (see #361). -# # package-lock.json takes priority: if it exists, npm installed the native # modules for Node. The repo ships bun.lock, so without this check, source # builds that use npm would be incorrectly routed to bun, causing ABI @@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then exec node "$DIR/dist/cli/qmd.js" "$@" elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then exec bun "$DIR/dist/cli/qmd.js" "$@" +elif command -v bun >/dev/null 2>&1; then + exec bun "$DIR/dist/cli/qmd.js" "$@" +elif [ -x "$HOME/.bun/bin/bun" ]; then + exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@" else exec node "$DIR/dist/cli/qmd.js" "$@" fi diff --git a/src/llm.ts b/src/llm.ts index 5022d27..cb8c6a2 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -191,9 +191,9 @@ export type RerankDocument = { // Model Configuration // ============================================================================= -// Embeddings use an OpenAI-compatible API by default. +// Embeddings use NVIDIA's OpenAI-compatible API by default. // Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings. -const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; +const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf"; const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; @@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME : join(homedir(), ".cache", "qmd", "models"); export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR; -const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1"; + +export function localModelsDisabled(): boolean { + return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? ""); +} function isLocalEmbeddingModel(model: string): boolean { return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../"); @@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM { if (texts.length === 0) return []; if (!this.embedApiKey) { throw new Error( - "External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " + + "External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " + "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI." ); } @@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM { async embed(text: string, options: EmbedOptions = {}): Promise { const model = options.model ?? this.embedModelUri; + if (localModelsDisabled() && isLocalEmbeddingModel(model)) { + throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model."); + } if (!isLocalEmbeddingModel(model)) { const results = await this.embedExternal([text], model, options); return results[0] ?? null; @@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM { */ async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> { const model = options.model ?? this.embedModelUri; + if (localModelsDisabled() && isLocalEmbeddingModel(model)) { + throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model."); + } if (!isLocalEmbeddingModel(model)) { return this.embedExternal(texts, model, options); } @@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM { // ========================================================================== async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise { + if (localModelsDisabled()) return []; if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); // Ping activity at start to keep models alive during this operation this.touchActivity(); @@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM { documents: RerankDocument[], options: RerankOptions = {} ): Promise { + if (localModelsDisabled()) { + return { + model: "disabled", + results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })), + }; + } if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); // Ping activity at start to keep models alive during this operation this.touchActivity(); diff --git a/src/store.ts b/src/store.ts index c9d1115..f7c853a 100644 --- a/src/store.ts +++ b/src/store.ts @@ -25,6 +25,7 @@ import { formatDocForEmbedding, withLLMSessionForLlm, DEFAULT_EMBED_MODEL_URI, + localModelsDisabled, type RerankDocument, type ILLMSession, } from "./llm.js"; @@ -4012,7 +4013,7 @@ export async function hybridQuery( const collection = options?.collection; const explain = options?.explain ?? false; const intent = options?.intent; - const skipRerank = options?.skipRerank ?? false; + const skipRerank = options?.skipRerank ?? localModelsDisabled(); const hooks = options?.hooks; const rankedLists: RankedResult[][] = []; @@ -4407,7 +4408,7 @@ export async function structuredSearch( const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; const explain = options?.explain ?? false; const intent = options?.intent; - const skipRerank = options?.skipRerank ?? false; + const skipRerank = options?.skipRerank ?? localModelsDisabled(); const hooks = options?.hooks; const collections = options?.collections; diff --git a/test/launcher-detection.test.sh b/test/launcher-detection.test.sh index abd0daa..833e758 100644 --- a/test/launcher-detection.test.sh +++ b/test/launcher-detection.test.sh @@ -20,10 +20,16 @@ fail() { printf " %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1) # Instead of exec-ing a runtime, we echo which one would be chosen. detect_runtime() { local DIR="$1" + local BUN_AVAILABLE="${2:-1}" + local HOME_BUN_AVAILABLE="${3:-1}" if [ -f "$DIR/package-lock.json" ]; then echo "node" elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then echo "bun" + elif [[ "$BUN_AVAILABLE" == "1" ]]; then + echo "bun" + elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then + echo "home-bun" else echo "node" fi @@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ===" # --- Test cases --- -# 1. No lockfiles → default to node +# 1. No lockfiles → default to bun when bun is available d="$TMPDIR_BASE/no-lockfiles" mkdir -p "$d" -assert_runtime "no lockfiles → node" "$d" "node" +assert_runtime "no lockfiles + bun available → bun" "$d" "bun" + +got=$(detect_runtime "$d" 0 0) +if [[ "$got" == "node" ]]; then + ok "no lockfiles + no bun → node" +else + fail "no lockfiles + no bun → node" "$got" "node" +fi + +got=$(detect_runtime "$d" 0 1) +if [[ "$got" == "home-bun" ]]; then + ok "no lockfiles + home bun → home-bun" +else + fail "no lockfiles + home bun → home-bun" "$got" "home-bun" +fi # 2. Only bun.lock → bun d="$TMPDIR_BASE/bun-lock-only" diff --git a/test/llm.test.ts b/test/llm.test.ts index b1dd64f..f1b09ae 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => { }); describe("LlamaCpp model resolution (config > env > default)", () => { - const HARDCODED_EMBED = "text-embedding-3-small"; + const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2"; const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; @@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => { } }); - test("default embedding uses external OpenAI-compatible API", async () => { + test("default embedding uses NVIDIA OpenAI-compatible API", async () => { const prevKey = process.env.QMD_EMBED_API_KEY; + const prevNvidiaKey = process.env.NVIDIA_API_KEY; + const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL; process.env.QMD_EMBED_API_KEY = "test-key"; + delete process.env.NVIDIA_API_KEY; + delete process.env.QMD_EMBED_API_BASE_URL; const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({ ok: true, json: async () => ({ - model: "text-embedding-3-small", + model: "nvidia/llama-3.2-nv-embedqa-1b-v2", data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }], }), } as Response); @@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => { try { const llm = new LlamaCpp({}); const result = await llm.embed("hello"); - expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({ + expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({ method: "POST", })); + const [, init] = fetchMock.mock.calls[0]!; + expect(JSON.parse((init as RequestInit).body as string)).toEqual({ + model: "nvidia/llama-3.2-nv-embedqa-1b-v2", + input: ["hello"], + input_type: "passage", + }); expect(result).toEqual({ embedding: [0.1, 0.2, 0.3], - model: "text-embedding-3-small", + model: "nvidia/llama-3.2-nv-embedqa-1b-v2", }); } finally { fetchMock.mockRestore(); if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY; else process.env.QMD_EMBED_API_KEY = prevKey; + if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY; + else process.env.NVIDIA_API_KEY = prevNvidiaKey; + if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL; + else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl; } }); @@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => { const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }); expect(llm.usesLocalEmbedding).toBe(true); }); + + test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => { + const prev = process.env.QMD_DISABLE_LOCAL_MODELS; + process.env.QMD_DISABLE_LOCAL_MODELS = "1"; + try { + const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }); + await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled"); + await expect(llm.expandQuery("hello")).resolves.toEqual([]); + await expect(llm.rerank("hello", [{ file: "doc.md", text: "hello" }])).resolves.toEqual({ + model: "disabled", + results: [{ file: "doc.md", score: 0, index: 0 }], + }); + } finally { + if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS; + else process.env.QMD_DISABLE_LOCAL_MODELS = prev; + } + }); }); describe("LlamaCpp embedding truncation", () => {