diff --git a/CHANGELOG.md b/CHANGELOG.md index 16cd7cf..9ea1d76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ ### Fixes +- Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable. +- Tests: make `bun run test` execute the local unit suite under both Node/Vitest and Bun (`test:node` + `test:bun`) so runtime-specific regressions are caught before CI. +- Model config: centralize embedding/rerank/generation model resolution so `qmd embed`, `status`, `query`, `vsearch`, `pull`, SDK vector search, and `bench` use the same active `.qmd/index.yaml` model hints and environment fallbacks. +- GPU/status: `qmd status` now uses the same embedding model identity as `qmd embed` when computing pending embeddings, so URI-backed embeddings are not incorrectly reported as pending under the legacy `embeddinggemma` alias. +- GPU status: `qmd status` now always shows GPU mode/configuration without unsafe native probing, and CPU-fallback warnings point to `QMD_STATUS_DEVICE_PROBE=1 qmd status` for an actual backend probe. The no-GPU warning is emitted once per process instead of once per LLM instance during benchmarks. - GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands. 
- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query` (CLI JSON output and snippet headers) now return absolute source-file diff --git a/package.json b/package.json index 7425650..116a16e 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,10 @@ "scripts": { "prepare": "[ -d .git ] && ./scripts/install-hooks.sh || true", "build": "tsc -p tsconfig.build.json && printf '#!/usr/bin/env node\n' | cat - dist/cli/qmd.js > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js", - "test": "vitest run --reporter=verbose test/", + "test": "bun run test:unit", + "test:node": "node ./node_modules/vitest/vitest.mjs run --reporter=verbose", + "test:bun": "bun test --preload ./src/test-preload.ts", + "test:unit": "bun run test:node -- test/ && bun run test:bun -- test/", "qmd": "tsx src/cli/qmd.ts", "index": "tsx src/cli/qmd.ts index", "vector": "tsx src/cli/qmd.ts vector", diff --git a/skills/qmd/SKILL.md b/skills/qmd/SKILL.md index c0db897..904b24a 100644 --- a/skills/qmd/SKILL.md +++ b/skills/qmd/SKILL.md @@ -1,138 +1,161 @@ --- name: qmd -description: Search markdown knowledge bases, notes, and documentation using QMD. Use when users ask to search notes, find documents, or look up information. +description: Search local markdown knowledge bases, notes, docs, and wikis with QMD. Use when users ask to find notes, retrieve documents, inspect a wiki, answer from indexed markdown, or set up QMD access. license: MIT compatibility: Requires qmd CLI or MCP server. Install via `npm install -g @tobilu/qmd`. metadata: author: tobi - version: "2.0.0" + version: "2.1.0" allowed-tools: Bash(qmd:*), mcp__qmd__* --- -# QMD - Quick Markdown Search +# QMD - Query Markdown Documents -Local search engine for markdown content. +QMD is a local search and retrieval engine for markdown collections: notes, docs, +wikis, transcripts, and project knowledge bases. 
Use it before generic web search +when the user is asking about something that may already live in their indexed +local markdown. -## Status +## Status Check -!`qmd status 2>/dev/null || echo "Not installed: npm install -g @tobilu/qmd"` +Start by checking what QMD can see: -## MCP: `query` +```bash +qmd collection list +qmd ls +``` + +For health details: + +```bash +qmd status +``` + +If QMD is missing: + +```bash +npm install -g @tobilu/qmd +``` + +## Retrieval Workflow + +1. **Discover collections** with `qmd collection list` or `qmd ls`. +2. **Search first**, usually with a small result count. +3. **Retrieve source documents** with `qmd get` or `qmd multi-get`. +4. **Answer from the retrieved text**, citing file paths or docids. +5. **If results are weak**, rewrite the query using a different search mode. + +Do not answer from search-result snippets alone when the user needs substance. +Fetch the document. + +## Search Modes + +### Fast lexical search + +Use BM25 when you know names, exact terms, titles, identifiers, or code symbols: + +```bash +qmd search "cockpit OKR Goodhart" -n 10 +qmd search '"AI Before Headcount"' -c concepts -n 5 +``` + +Good `lex` queries are short: 2-6 discriminative terms, quoted phrases when exact, +and no filler words. + +### Hybrid query search + +Use `qmd query` when semantic recall, query expansion, vector search, or reranking +matters more than speed: + +```bash +qmd query "decision quality depends on surfacing assumptions and context" -n 10 +qmd query --json --explain "metrics as cockpit instruments but not OKRs" +``` + +`qmd query` may initialize local models. If models/GPU are unavailable, slow, or +crashing, fall back to `qmd search` and use better lexical terms. 
+ +### Structured queries + +For subtle wiki/doc searches, structured queries are usually strongest: + +```bash +qmd query $'intent: Find the concept note about metrics as instruments without letting OKRs replace judgment.\nlex: cockpit instruments OKR Goodhart metrics judgment\nvec: data informed not metric driven product judgment\nhyde: A concept note says metrics are useful like cockpit instruments, but leaders should remain data-informed rather than metric-driven because OKRs and dashboards can Goodhart product judgment.' +``` + +Use this pattern when the user's wording is indirect: + +- `intent:` disambiguates the target. +- `lex:` anchors exact names, phrases, aliases, and rare terms. +- `vec:` adds the semantic paraphrase. +- `hyde:` describes the document that would answer the query. + +Put the best query first; early searches receive more weight in fusion. + +## MCP Tool: `query` + +When using the MCP server, prefer structured searches: ```json { "searches": [ - { "type": "lex", "query": "CAP theorem consistency" }, - { "type": "vec", "query": "tradeoff between consistency and availability" } + { "type": "lex", "query": "cockpit OKR Goodhart" }, + { "type": "vec", "query": "data informed not metric driven product judgment" }, + { "type": "hyde", "query": "A concept note explains that metrics are useful as instruments, but leaders should not let OKRs or dashboards replace judgment." } ], - "collections": ["docs"], + "intent": "Find the concept note about using metrics as instruments without becoming metric-driven.", + "collections": ["concepts"], "limit": 10 } ``` ### Query Types -| Type | Method | Input | -|------|--------|-------| -| `lex` | BM25 | Keywords — exact terms, names, code | -| `vec` | Vector | Question — natural language | -| `hyde` | Vector | Answer — hypothetical result (50-100 words) | +- `lex` — BM25 keyword search. Best for exact terms, names, titles, and code. +- `vec` — vector semantic search. Best for natural-language concepts. 
+- `hyde` — vector search using a hypothetical answer/document passage. -### Writing Good Queries - -**lex (keyword)** -- 2-5 terms, no filler words -- Exact phrase: `"connection pool"` (quoted) -- Exclude terms: `performance -sports` (minus prefix) -- Code identifiers work: `handleError async` - -**vec (semantic)** -- Full natural language question -- Be specific: `"how does the rate limiter handle burst traffic"` -- Include context: `"in the payment service, how are refunds processed"` - -**hyde (hypothetical document)** -- Write 50-100 words of what the *answer* looks like -- Use the vocabulary you expect in the result - -**expand (auto-expand)** -- Use a single-line query (implicit) or `expand: question` on its own line -- Lets the local LLM generate lex/vec/hyde variations -- Do not mix `expand:` with other typed lines — it's either a standalone expand query or a full query document - -### Intent (Disambiguation) - -When a query term is ambiguous, add `intent` to steer results: - -```json -{ - "searches": [ - { "type": "lex", "query": "performance" } - ], - "intent": "web page load times and Core Web Vitals" -} -``` - -Intent affects expansion, reranking, chunk selection, and snippet extraction. It does not search on its own — it's a steering signal that disambiguates queries like "performance" (web-perf vs team health vs fitness). - -### Combining Types - -| Goal | Approach | -|------|----------| -| Know exact terms | `lex` only | -| Don't know vocabulary | Use a single-line query (implicit `expand:`) or `vec` | -| Best recall | `lex` + `vec` | -| Complex topic | `lex` + `vec` + `hyde` | -| Ambiguous query | Add `intent` to any combination above | - -First query gets 2x weight in fusion — put your best guess first. 
- -### Lex Query Syntax - -| Syntax | Meaning | Example | -|--------|---------|---------| -| `term` | Prefix match | `perf` matches "performance" | -| `"phrase"` | Exact phrase | `"rate limiter"` | -| `-term` | Exclude | `performance -sports` | - -Note: `-term` only works in lex queries, not vec/hyde. - -### Collection Filtering - -```json -{ "collections": ["docs"] } // Single -{ "collections": ["docs", "notes"] } // Multiple (OR) -``` - -Omit to search all collections. - -## Other MCP Tools - -| Tool | Use | -|------|-----| -| `get` | Retrieve doc by path or `#docid` | -| `multi_get` | Retrieve multiple by glob/list | -| `status` | Collections and health | - -## CLI +## Retrieval Commands ```bash -qmd query "question" # Auto-expand + rerank -qmd query $'lex: X\nvec: Y' # Structured -qmd query $'expand: question' # Explicit expand -qmd query --json --explain "q" # Show score traces (RRF + rerank blend) -qmd search "keywords" # BM25 only (no LLM) -qmd get "#abc123" # By docid -qmd multi-get "journals/2026-*.md" -l 40 # Batch pull snippets by glob -qmd multi-get notes/foo.md,notes/bar.md # Comma-separated list, preserves order +qmd get "#abc123" # retrieve by docid +qmd get qmd://concepts/ai-before-headcount.md --full +qmd multi-get 'concepts/{ai-before-headcount.md,data-informed-not-metric-driven.md}' --md +qmd multi-get 'sources/podcast-2025-*.md' -l 80 ``` -## HTTP API +Use `multi-get` when comparing several hits or gathering context across pages. +Use `--full` when the exact source matters. + +## Collection Filtering ```bash -curl -X POST http://localhost:8181/query \ - -H "Content-Type: application/json" \ - -d '{"searches": [{"type": "lex", "query": "test"}]}' +qmd search "headcount autonomous agents" -c concepts -n 10 +qmd query "merchant support product reality" -c concepts -c sources -n 10 +``` + +Omit `-c` / `collections` to search everything. Add collection filters when a +broad query drifts into the wrong corpus. 
+ +## Query Craft + +Good QMD searches mix three things: + +1. **Title/alias anchors:** exact page titles, named entities, phrases. +2. **Semantic paraphrase:** how a human would describe the idea. +3. **Negative space:** enough intent to avoid nearby-but-wrong concepts. + +Examples: + +```bash +# Exact-ish title lookup +qmd search '"arm the rebels" merchants tools big companies' -c concepts + +# Semantic concept lookup +qmd query $'intent: Find the customer proximity concept, not generic customer delight.\nlex: support pseudonymous merchant customer interviews\nvec: founder stays close to merchant reality through support and product use' + +# Source lookup +qmd search "six-week cadence WhatsApp merchant relationships Shawn Ryan" -c sources -n 10 ``` ## Setup @@ -142,3 +165,28 @@ npm install -g @tobilu/qmd qmd collection add ~/notes --name notes qmd embed ``` + +Only add collections or generate embeddings when the user asked for setup or +index maintenance. Searching and retrieving are safe; collection/index mutation is +not a casual first step. + +## MCP Setup + +See `references/mcp-setup.md` for Claude Code, Claude Desktop, OpenClaw, and HTTP +server configuration. + +## Pitfalls + +- **Do not stop at snippets.** Fetch documents before making claims. +- **Do not overuse semantic search.** If you know exact titles or terms, BM25 is + faster and often better. +- **Do not mutate indexes casually.** `qmd collection add`, `qmd update`, and + `qmd embed` change local state and can be expensive. +- **Model-backed commands can be environment-sensitive.** If `qmd query`, + `qmd vsearch`, or reranking fails because local models/GPU are unavailable, + use `qmd search` and stronger lexical/structured terms. +- **Ambiguous user wording needs intent.** Add `intent:` rather than hoping query + expansion guesses the right domain. 
+- **Collection names matter.** Search `concepts` for synthesized wiki pages, + `sources` for transcripts/raw source pages, and docs collections for code/project + documentation. diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 25a2a0d..28762fa 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -78,7 +78,7 @@ import { type ReindexResult, type ChunkStrategy, } from "../store.js"; -import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js"; +import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_MODEL_CACHE_DIR, resolveEmbedModel, resolveGenerateModel, resolveRerankModel, resolveModels } from "../llm.js"; import { formatSearchResults, formatDocuments, @@ -311,8 +311,8 @@ function formatETA(seconds: number): string { // Check index health and print warnings/tips -function checkIndexHealth(db: Database): void { - const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db); +function checkIndexHealth(db: Database, model: string = resolveEmbedModelForCli()): void { + const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db, model); // Warn if many docs need embedding if (needsEmbedding > 0) { @@ -410,7 +410,8 @@ async function showStatus(): Promise { // Overall stats const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }; const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number }; - const needsEmbedding = getHashesNeedingEmbedding(db); + const statusEmbedModel = resolveEmbedModelForCli(); + const needsEmbedding = getHashesNeedingEmbedding(db, undefined, statusEmbedModel); // Most recent update across all collections const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 
1`).get() as { latest: string | null }; @@ -536,18 +537,26 @@ async function showStatus(): Promise { const match = uri.match(/^hf:([^/]+\/[^/]+)\//); return match ? `https://huggingface.co/${match[1]}` : uri; }; + const activeModels = resolveModelsForCli(); console.log(`\n${c.bold}Models${c.reset}`); - console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); - console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); - console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + console.log(` Embedding: ${hfLink(activeModels.embed)}`); + console.log(` Reranking: ${hfLink(activeModels.rerank)}`); + console.log(` Generation: ${hfLink(activeModels.generate)}`); } // Device / GPU info // Important: probing node-llama-cpp can abort the whole process on machines with // incompatible GPU drivers (for example Vulkan loader present but no usable driver). - // Keep `qmd status` safe by default and make the expensive/native probe opt-in. - if (process.env.QMD_STATUS_DEVICE_PROBE === "1") { - console.log(`\n${c.bold}Device${c.reset}`); + // Keep the native probe opt-in, but always show how QMD is configured and how to probe. + console.log(`\n${c.bold}Device${c.reset}`); + const configuredGpuMode = process.env.QMD_FORCE_CPU && !["false", "off", "none", "disable", "disabled", "0"].includes(process.env.QMD_FORCE_CPU.trim().toLowerCase()) + ? 
"CPU forced (QMD_FORCE_CPU)" + : (process.env.QMD_LLAMA_GPU?.trim() || "auto"); + console.log(` Mode: ${configuredGpuMode}`); + if (process.env.QMD_STATUS_DEVICE_PROBE !== "1") { + console.log(` Status: ${c.dim}not probed${c.reset} (set QMD_STATUS_DEVICE_PROBE=1 to test GPU/CPU backend)`); + } else { + console.log(` Status: probing native llama backend...`); try { const llm = getDefaultLlamaCpp(); const device = await llm.getDeviceInfo({ allowBuild: false }); @@ -1794,7 +1803,35 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined { } export function resolveEmbedModelForCli(): string { - return process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL_URI; + try { + return resolveEmbedModel(loadConfig().models); + } catch { + return resolveEmbedModel(); + } +} + +export function resolveGenerateModelForCli(): string { + try { + return resolveGenerateModel(loadConfig().models); + } catch { + return resolveGenerateModel(); + } +} + +export function resolveRerankModelForCli(): string { + try { + return resolveRerankModel(loadConfig().models); + } catch { + return resolveRerankModel(); + } +} + +function resolveModelsForCli(): { embed: string; generate: string; rerank: string } { + try { + return resolveModels(loadConfig().models); + } catch { + return resolveModels(); + } } async function vectorIndex( @@ -3531,10 +3568,11 @@ if (isMain) { case "pull": { const refresh = cli.values.refresh === undefined ? 
false : Boolean(cli.values.refresh); + const activeModels = resolveModelsForCli(); const models = [ - DEFAULT_EMBED_MODEL_URI, - DEFAULT_GENERATE_MODEL_URI, - DEFAULT_RERANK_MODEL_URI, + activeModels.embed, + activeModels.generate, + activeModels.rerank, ]; console.log(`${c.bold}Pulling models${c.reset}`); const results = await pullModels(models, { diff --git a/src/index.ts b/src/index.ts index e8e2a45..f853a97 100644 --- a/src/index.ts +++ b/src/index.ts @@ -23,7 +23,6 @@ import { structuredSearch, extractSnippet, addLineNumbers, - DEFAULT_EMBED_MODEL, DEFAULT_MULTI_GET_MAX_BYTES, reindexCollection, generateEmbeddings, @@ -423,7 +422,7 @@ export async function createStore(options: StoreOptions): Promise { }); }, searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection), - searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection), + searchVector: async (q, opts) => internal.searchVec(q, llm.embedModelName, opts?.limit, opts?.collection), expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent), get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts), getDocumentBody: async (pathOrDocid, opts) => { diff --git a/src/llm.ts b/src/llm.ts index bab9e5f..3047b20 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -31,6 +31,7 @@ async function loadNodeLlamaCpp(): Promise { export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void { nodeLlamaCppImport = module ? Promise.resolve(module) : null; failedGpuInitModes.clear(); + noGpuAccelerationWarningShown = false; } type StdoutWrite = typeof process.stdout.write; @@ -83,7 +84,7 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean { * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active. */ export function formatQueryForEmbedding(query: string, modelUri?: string): string { - const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? 
DEFAULT_EMBED_MODEL; + const uri = modelUri ?? resolveEmbedModel(); if (isQwen3EmbeddingModel(uri)) { return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`; } @@ -96,7 +97,7 @@ export function formatQueryForEmbedding(query: string, modelUri?: string): strin * Qwen3-Embedding encodes documents as raw text without special prefixes. */ export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string { - const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL; + const uri = modelUri ?? resolveEmbedModel(); if (isQwen3EmbeddingModel(uri)) { // Qwen3-Embedding: documents are raw text, no task prefix return title ? `${title}\n${text}` : text; @@ -255,6 +256,32 @@ export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL; export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL; export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL; +export type ModelResolutionConfig = { + embed?: string; + generate?: string; + rerank?: string; +}; + +export function resolveEmbedModel(config?: ModelResolutionConfig): string { + return config?.embed || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; +} + +export function resolveGenerateModel(config?: ModelResolutionConfig): string { + return config?.generate || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL; +} + +export function resolveRerankModel(config?: ModelResolutionConfig): string { + return config?.rerank || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; +} + +export function resolveModels(config?: ModelResolutionConfig): Required<ModelResolutionConfig> { + return { + embed: resolveEmbedModel(config), + generate: resolveGenerateModel(config), + rerank: resolveRerankModel(config), + }; +} + // Local model cache directory const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME ?
join(process.env.XDG_CACHE_HOME, "qmd", "models") @@ -579,6 +606,7 @@ function resolveExpandContextSize(configValue?: number): number { } const failedGpuInitModes = new Set(); +let noGpuAccelerationWarningShown = false; export class LlamaCpp implements LLM { private readonly _ciMode = !!process.env.CI; @@ -610,9 +638,9 @@ export class LlamaCpp implements LLM { constructor(config: LlamaCppConfig = {}) { - this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; - this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL; - this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; + this.embedModelUri = resolveEmbedModel({ embed: config.embedModel }); + this.generateModelUri = resolveGenerateModel({ generate: config.generateModel }); + this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel }); this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR; this.expandContextSize = resolveExpandContextSize(config.expandContextSize); this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS; @@ -623,6 +651,14 @@ export class LlamaCpp implements LLM { return this.embedModelUri; } + get generateModelName(): string { + return this.generateModelUri; + } + + get rerankModelName(): string { + return this.rerankModelUri; + } + /** * Reset the inactivity timer. Called after each model operation. * When timer fires, models are unloaded to free memory (if no active sessions). @@ -760,9 +796,10 @@ export class LlamaCpp implements LLM { } } - if (llama.gpu === false) { + if (llama.gpu === false && !noGpuAccelerationWarningShown) { + noGpuAccelerationWarningShown = true; process.stderr.write( - "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n" + "QMD Warning: no GPU acceleration, running on CPU (slow). 
Run 'QMD_STATUS_DEVICE_PROBE=1 qmd status' for device details.\n" ); } this.llama = llama; diff --git a/src/store.ts b/src/store.ts index 5323245..8d5ffa0 100644 --- a/src/store.ts +++ b/src/store.ts @@ -25,6 +25,9 @@ import { formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, + DEFAULT_EMBED_MODEL_URI, + DEFAULT_RERANK_MODEL_URI, + DEFAULT_GENERATE_MODEL_URI, type RerankDocument, type ILLMSession, } from "./llm.js"; @@ -39,9 +42,9 @@ import type { // Configuration // ============================================================================= -export const DEFAULT_EMBED_MODEL = "embeddinggemma"; -export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0"; -export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B"; +export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI; +export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI; +export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI; export const DEFAULT_GLOB = "**/*.md"; export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64; @@ -1749,8 +1752,8 @@ export function createStore(dbPath?: string): Store { searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding), // Query expansion & reranking - expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm), - rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model, db, intent, store.llm), + expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model ?? store.llm?.generateModelName ?? 
DEFAULT_QUERY_MODEL, db, intent, store.llm), + rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL, db, intent, store.llm), // Document retrieval findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options), diff --git a/test/cli.test.ts b/test/cli.test.ts index 1b551f2..4cbb7fb 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -27,14 +27,15 @@ let testCounter = 0; // Unique counter for each test run const thisDir = dirname(fileURLToPath(import.meta.url)); const projectRoot = join(thisDir, ".."); const qmdScript = join(projectRoot, "src", "cli", "qmd.ts"); -// Resolve tsx binary from project's node_modules (not cwd-dependent) -const tsxBin = (() => { - const candidate = join(projectRoot, "node_modules", ".bin", "tsx"); - if (existsSync(candidate)) { - return candidate; - } - return join(process.cwd(), "node_modules", ".bin", "tsx"); -})(); +const isBunRuntime = typeof (globalThis as { Bun?: unknown }).Bun !== "undefined"; +const tsxCli = join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs"); +const qmdCommand = isBunRuntime + ? 
{ command: process.execPath, args: [qmdScript] } + : { command: process.execPath, args: [tsxCli, qmdScript] }; + +function qmdRunnerArgs(args: string[]): { command: string; args: string[] } { + return { command: qmdCommand.command, args: [...qmdCommand.args, ...args] }; +} // Helper to run qmd command with test database async function runQmd( @@ -44,7 +45,8 @@ async function runQmd( const workingDir = options.cwd || fixturesDir; const dbPath = options.dbPath || testDbPath; const configDir = options.configDir || testConfigDir; - const proc = spawn(tsxBin, [qmdScript, ...args], { + const runner = qmdRunnerArgs(args); + const proc = spawn(runner.command, runner.args, { cwd: workingDir, env: { ...process.env, @@ -252,15 +254,15 @@ describe("CLI Skills", () => { expect(stderr).toBe(""); expect(exitCode).toBe(0); expect(stdout).toContain("qmd"); - expect(stdout).toContain("Search markdown knowledge bases"); + expect(stdout).toContain("Search local markdown knowledge bases"); }); test("gets version-matched runtime skill content", async () => { const { stdout, stderr, exitCode } = await runQmd(["skills", "get", "qmd"]); expect(stderr).toBe(""); expect(exitCode).toBe(0); - expect(stdout).toContain("# QMD - Quick Markdown Search"); - expect(stdout).toContain("## MCP: `query`"); + expect(stdout).toContain("# QMD - Query Markdown Documents"); + expect(stdout).toContain("## MCP Tool: `query`"); expect(stdout).not.toContain("This file is a discovery stub"); }); @@ -268,7 +270,7 @@ describe("CLI Skills", () => { const { stdout, stderr, exitCode } = await runQmd(["skills", "get", "qmd", "--full"]); expect(stderr).toBe(""); expect(exitCode).toBe(0); - expect(stdout).toContain("# QMD - Quick Markdown Search"); + expect(stdout).toContain("# QMD - Query Markdown Documents"); expect(stdout).toContain("--- references/mcp-setup.md ---"); expect(stdout).toContain("# QMD MCP Server Setup"); }); @@ -284,8 +286,8 @@ describe("CLI Skills", () => { const { stdout, stderr, exitCode } = await 
runQmd(["skill", "show"]); expect(stderr).toBe(""); expect(exitCode).toBe(0); - expect(stdout).toContain("# QMD - Quick Markdown Search"); - expect(stdout).toContain("## MCP: `query`"); + expect(stdout).toContain("# QMD - Query Markdown Documents"); + expect(stdout).toContain("## MCP Tool: `query`"); expect(stdout).not.toContain("This file is a discovery stub"); }); @@ -300,8 +302,8 @@ describe("CLI Skills", () => { const installedSkillDir = join(installDir, ".agents", "skills", "qmd"); const installed = readFileSync(join(installedSkillDir, "SKILL.md"), "utf8"); - expect(installed).toContain("# QMD - Quick Markdown Search"); - expect(installed).toContain("## MCP: `query`"); + expect(installed).toContain("# QMD - Query Markdown Documents"); + expect(installed).toContain("## MCP Tool: `query`"); expect(installed).not.toContain("This file is a discovery stub"); expect(readFileSync(join(installedSkillDir, "references", "mcp-setup.md"), "utf8")).toContain("# QMD MCP Server Setup"); }); @@ -370,7 +372,7 @@ describe("CLI Skill Commands", () => { expect(exitCode).toBe(0); const skillDir = join(projectDir, ".agents", "skills", "qmd"); - expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search"); + expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents"); expect(existsSync(join(projectDir, ".claude", "skills", "qmd"))).toBe(false); expect(stdout).toContain(`✓ Installed QMD skill to ${skillDir}`); expect(stdout).toContain("Tip: create a Claude symlink manually"); @@ -388,9 +390,9 @@ describe("CLI Skill Commands", () => { const skillDir = join(fakeHome, ".agents", "skills", "qmd"); const claudeLink = join(fakeHome, ".claude", "skills", "qmd"); - expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search"); + expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents"); 
expect(lstatSync(claudeLink).isSymbolicLink()).toBe(true); - expect(readFileSync(join(claudeLink, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search"); + expect(readFileSync(join(claudeLink, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents"); expect(stdout).toContain(`✓ Installed QMD skill to ${skillDir}`); expect(stdout).toContain(`✓ Linked Claude skill at ${claudeLink}`); }); @@ -408,7 +410,7 @@ describe("CLI Skill Commands", () => { const skillDir = join(fakeHome, ".agents", "skills", "qmd"); expect(lstatSync(skillDir).isSymbolicLink()).toBe(false); - expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search"); + expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents"); expect(stdout).toContain(`✓ Claude already sees the skill via ${join(fakeHome, ".claude", "skills")}`); }); @@ -470,10 +472,13 @@ describe("CLI Status Command", () => { expect(stdout).toContain("Collection"); }); - test("skips device probing by default", async () => { + test("shows device mode without native probing by default", async () => { const { stdout, exitCode } = await runQmd(["status"]); expect(exitCode).toBe(0); - expect(stdout).not.toContain("Device"); + expect(stdout).toContain("Device"); + expect(stdout).toContain("Mode:"); + expect(stdout).toContain("not probed"); + expect(stdout).toContain("QMD_STATUS_DEVICE_PROBE=1"); }); }); @@ -1577,7 +1582,8 @@ describe("mcp http daemon", () => { port: number, options: { args?: string[]; env?: Record } = {}, ): import("child_process").ChildProcess { - const proc = spawn(tsxBin, [qmdScript, ...(options.args ?? []), "mcp", "--http", "--port", String(port)], { + const runner = qmdRunnerArgs([...(options.args ?? 
[]), "mcp", "--http", "--port", String(port)]); + const proc = spawn(runner.command, runner.args, { cwd: fixturesDir, env: { ...process.env, diff --git a/test/llm.test.ts b/test/llm.test.ts index 2fc03cd..0ab1281 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -17,6 +17,10 @@ import { withNativeStdoutRedirectedToStderr, resolveParallelismOverride, resolveSafeParallelism, + resolveEmbedModel, + resolveGenerateModel, + resolveRerankModel, + resolveModels, withLLMSession, canUnloadLLM, SessionReleasedError, @@ -24,6 +28,63 @@ import { type ILLMSession, } from "../src/llm.js"; +describe("model name resolution", () => { + function withModelEnv(env: Record, fn: () => void): void { + const previous = { + QMD_EMBED_MODEL: process.env.QMD_EMBED_MODEL, + QMD_GENERATE_MODEL: process.env.QMD_GENERATE_MODEL, + QMD_RERANK_MODEL: process.env.QMD_RERANK_MODEL, + }; + try { + for (const [key, value] of Object.entries(env)) { + if (value === undefined) delete process.env[key]; + else process.env[key] = value; + } + fn(); + } finally { + for (const [key, value] of Object.entries(previous)) { + if (value === undefined) delete process.env[key]; + else process.env[key] = value; + } + } + } + + test("all model roles resolve config hints before env fallbacks", () => { + withModelEnv({ + QMD_EMBED_MODEL: "env-embed", + QMD_GENERATE_MODEL: "env-generate", + QMD_RERANK_MODEL: "env-rerank", + }, () => { + const config = { + embed: "config-embed", + generate: "config-generate", + rerank: "config-rerank", + }; + expect(resolveEmbedModel(config)).toBe("config-embed"); + expect(resolveGenerateModel(config)).toBe("config-generate"); + expect(resolveRerankModel(config)).toBe("config-rerank"); + expect(resolveModels(config)).toEqual(config); + }); + }); + + test("LlamaCpp constructor uses the same resolver as status/embed/query helpers", () => { + withModelEnv({ + QMD_EMBED_MODEL: "env-embed", + QMD_GENERATE_MODEL: "env-generate", + QMD_RERANK_MODEL: "env-rerank", + }, () => { + const llm = 
new LlamaCpp({ + embedModel: "config-embed", + generateModel: "config-generate", + rerankModel: "config-rerank", + }); + expect(llm.embedModelName).toBe(resolveEmbedModel({ embed: "config-embed" })); + expect(llm.generateModelName).toBe(resolveGenerateModel({ generate: "config-generate" })); + expect(llm.rerankModelName).toBe(resolveRerankModel({ rerank: "config-rerank" })); + }); + }); +}); + // ============================================================================= // Singleton Tests (no model loading required) // ============================================================================= @@ -178,6 +239,40 @@ describe("native llama stdout containment", () => { else process.env.QMD_FORCE_CPU = prevForceCpu; } }); + + test("warns about CPU fallback only once per process", async () => { + const prevGpu = process.env.QMD_LLAMA_GPU; + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_LLAMA_GPU = "false"; + delete process.env.QMD_FORCE_CPU; + + setNodeLlamaCppModuleForTest({ + LlamaLogLevel: { error: "error" }, + resolveModelFile: vi.fn(), + LlamaChatSession: vi.fn() as any, + getLlama: vi.fn(async () => ({ gpu: false, cpuMathCores: 4 }) as any), + }); + + const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); + try { + const first = new LlamaCpp(); + const second = new LlamaCpp(); + + await (first as any).ensureLlama(); + await (second as any).ensureLlama(); + + const stderr = String(stderrSpy.mock.calls.map(call => call[0]).join("")); + expect(stderr.match(/no GPU acceleration/g)?.length).toBe(1); + expect(stderr).toContain("QMD_STATUS_DEVICE_PROBE=1 qmd status"); + } finally { + stderrSpy.mockRestore(); + setNodeLlamaCppModuleForTest(null); + if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU; + else process.env.QMD_LLAMA_GPU = prevGpu; + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); }); describe("LLM context parallelism safety", () => 
{ diff --git a/test/local-config.test.ts b/test/local-config.test.ts index ef9af72..8bc6bf0 100644 --- a/test/local-config.test.ts +++ b/test/local-config.test.ts @@ -5,6 +5,17 @@ import { tmpdir } from "node:os"; import { afterEach, describe, expect, test } from "vitest"; import { findLocalConfigPath, getLocalDbPath } from "../src/collections.js"; +function cliCommandArgs(command: string): { bin: string; args: string[] } { + const cliPath = join(process.cwd(), "src/cli/qmd.ts"); + if (process.versions.bun) { + return { bin: process.execPath, args: [cliPath, command] }; + } + return { + bin: process.execPath, + args: [join(process.cwd(), "node_modules/tsx/dist/cli.mjs"), cliPath, command], + }; +} + const roots: string[] = []; function tempProject(): string { @@ -56,12 +67,11 @@ describe("local .qmd project config", () => { mkdirSync(join(root, ".qmd"), { recursive: true }); mkdirSync(join(root, "docs"), { recursive: true }); writeFileSync(join(root, "docs", "a.md"), "# A\n\nLocal test document.\n"); - writeFileSync(join(root, ".qmd", "index.yaml"), `collections:\n docs:\n path: ${JSON.stringify(join(root, "docs"))}\n pattern: "**/*.md"\n context:\n /: Local test docs\n`); + writeFileSync(join(root, ".qmd", "index.yaml"), `collections:\n docs:\n path: ${JSON.stringify(join(root, "docs"))}\n pattern: "**/*.md"\n context:\n /: Local test docs\nmodels:\n embed: local-embed-model\n rerank: local-rerank-model\n generate: local-generate-model\n`); const home = join(root, "home"); - const tsxBin = join(process.cwd(), "node_modules", ".bin", "tsx"); - const runner = existsSync(tsxBin) ? 
tsxBin : "bun"; - const output = execFileSync(runner, [join(process.cwd(), "src/cli/qmd.ts"), "status"], { + const { bin, args } = cliCommandArgs("status"); + const output = execFileSync(bin, args, { cwd: root, encoding: "utf-8", env: { @@ -69,12 +79,19 @@ describe("local .qmd project config", () => { HOME: home, XDG_CONFIG_HOME: join(home, ".config"), XDG_CACHE_HOME: join(home, ".cache"), + QMD_EMBED_MODEL: "env-embed-model", + QMD_RERANK_MODEL: "env-rerank-model", + QMD_GENERATE_MODEL: "env-generate-model", }, }); const localIndex = join(root, ".qmd", "index.sqlite"); expect(output).toContain(`Index: ${realpathSync(localIndex)}`); expect(output).toContain("docs (qmd://docs/)"); + expect(output).toContain("Embedding: local-embed-model"); + expect(output).toContain("Reranking: local-rerank-model"); + expect(output).toContain("Generation: local-generate-model"); + expect(output).not.toContain("env-embed-model"); expect(existsSync(localIndex)).toBe(true); expect(existsSync(join(home, ".cache", "qmd", "index.sqlite"))).toBe(false); }); diff --git a/test/mcp.test.ts b/test/mcp.test.ts index 495c624..d0abeb7 100644 --- a/test/mcp.test.ts +++ b/test/mcp.test.ts @@ -186,7 +186,7 @@ function seedTestData(db: Database): void { for (let i = 0; i < 768; i++) embedding[i] = Math.random(); for (const doc of docs.slice(0, 4)) { // Skip large file for embeddings - db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`).run(doc.hash, now); + db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, now); db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${doc.hash}_0`, embedding); } } diff --git a/test/package.test.ts b/test/package.test.ts index 7ba41ac..030d1aa 100644 --- a/test/package.test.ts +++ b/test/package.test.ts @@ -20,8 +20,8 @@ describe("package grammar distribution", () => { expect(pkg.files, 
"published package files").toContain("scripts/check-package-grammars.mjs"); expect(pkg.files, "published package files").toContain("skills/"); const qmdSkill = readFileSync(new URL("skills/qmd/SKILL.md", root), "utf8"); - expect(qmdSkill).toContain("# QMD - Quick Markdown Search"); - expect(qmdSkill).toContain("## MCP: `query`"); + expect(qmdSkill).toContain("# QMD - Query Markdown Documents"); + expect(qmdSkill).toContain("## MCP Tool: `query`"); expect(qmdSkill).not.toContain("This file is a discovery stub"); const scriptPath = join(root.pathname, "scripts", "check-package-grammars.mjs");