Merge pull request #656 from tobi/fix/gpu-status-warning

Fix GPU status guidance and benchmark warnings
This commit is contained in:
Tobias Lütke 2026-05-16 19:55:39 -04:00 committed by GitHub
commit ddbd6bd8be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 415 additions and 164 deletions

View File

@ -8,6 +8,11 @@
### Fixes
- Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable.
- Tests: make `bun run test` execute the local unit suite under both Node/Vitest and Bun (`test:node` + `test:bun`) so runtime-specific regressions are caught before CI.
- Model config: centralize embedding/rerank/generation model resolution so `qmd embed`, `status`, `query`, `vsearch`, `pull`, SDK vector search, and `bench` use the same active `.qmd/index.yaml` model hints and environment fallbacks.
- GPU/status: `qmd status` now uses the same embedding model identity as `qmd embed` when computing pending embeddings, so URI-backed embeddings are not incorrectly reported as pending under the legacy `embeddinggemma` alias.
- GPU status: `qmd status` now always shows GPU mode/configuration without unsafe native probing, and CPU-fallback warnings point to `QMD_STATUS_DEVICE_PROBE=1 qmd status` for an actual backend probe. The no-GPU warning is emitted once per process instead of once per LLM instance during benchmarks.
- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
(CLI JSON output and snippet headers) now return absolute source-file

View File

@ -25,7 +25,10 @@
"scripts": {
"prepare": "[ -d .git ] && ./scripts/install-hooks.sh || true",
"build": "tsc -p tsconfig.build.json && printf '#!/usr/bin/env node\n' | cat - dist/cli/qmd.js > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
"test": "vitest run --reporter=verbose test/",
"test": "bun run test:unit",
"test:node": "node ./node_modules/vitest/vitest.mjs run --reporter=verbose",
"test:bun": "bun test --preload ./src/test-preload.ts",
"test:unit": "bun run test:node -- test/ && bun run test:bun -- test/",
"qmd": "tsx src/cli/qmd.ts",
"index": "tsx src/cli/qmd.ts index",
"vector": "tsx src/cli/qmd.ts vector",

View File

@ -1,138 +1,161 @@
---
name: qmd
description: Search markdown knowledge bases, notes, and documentation using QMD. Use when users ask to search notes, find documents, or look up information.
description: Search local markdown knowledge bases, notes, docs, and wikis with QMD. Use when users ask to find notes, retrieve documents, inspect a wiki, answer from indexed markdown, or set up QMD access.
license: MIT
compatibility: Requires qmd CLI or MCP server. Install via `npm install -g @tobilu/qmd`.
metadata:
author: tobi
version: "2.0.0"
version: "2.1.0"
allowed-tools: Bash(qmd:*), mcp__qmd__*
---
# QMD - Quick Markdown Search
# QMD - Query Markdown Documents
Local search engine for markdown content.
QMD is a local search and retrieval engine for markdown collections: notes, docs,
wikis, transcripts, and project knowledge bases. Use it before generic web search
when the user is asking about something that may already live in their indexed
local markdown.
## Status
## Status Check
!`qmd status 2>/dev/null || echo "Not installed: npm install -g @tobilu/qmd"`
Start by checking what QMD can see:
## MCP: `query`
```bash
qmd collection list
qmd ls
```
For health details:
```bash
qmd status
```
If QMD is missing:
```bash
npm install -g @tobilu/qmd
```
## Retrieval Workflow
1. **Discover collections** with `qmd collection list` or `qmd ls`.
2. **Search first**, usually with a small result count.
3. **Retrieve source documents** with `qmd get` or `qmd multi-get`.
4. **Answer from the retrieved text**, citing file paths or docids.
5. **If results are weak**, rewrite the query using a different search mode.
Do not answer from search-result snippets alone when the user needs substance.
Fetch the document.
## Search Modes
### Fast lexical search
Use BM25 when you know names, exact terms, titles, identifiers, or code symbols:
```bash
qmd search "cockpit OKR Goodhart" -n 10
qmd search '"AI Before Headcount"' -c concepts -n 5
```
Good `lex` queries are short: 2-6 discriminative terms, quoted phrases when exact,
and no filler words.
### Hybrid query search
Use `qmd query` when semantic recall, query expansion, vector search, or reranking
matters more than speed:
```bash
qmd query "decision quality depends on surfacing assumptions and context" -n 10
qmd query --json --explain "metrics as cockpit instruments but not OKRs"
```
`qmd query` may initialize local models. If models/GPU are unavailable, slow, or
crashing, fall back to `qmd search` and use better lexical terms.
### Structured queries
For subtle wiki/doc searches, structured queries are usually strongest:
```bash
qmd query $'intent: Find the concept note about metrics as instruments without letting OKRs replace judgment.\nlex: cockpit instruments OKR Goodhart metrics judgment\nvec: data informed not metric driven product judgment\nhyde: A concept note says metrics are useful like cockpit instruments, but leaders should remain data-informed rather than metric-driven because OKRs and dashboards can Goodhart product judgment.'
```
Use this pattern when the user's wording is indirect:
- `intent:` disambiguates the target.
- `lex:` anchors exact names, phrases, aliases, and rare terms.
- `vec:` adds the semantic paraphrase.
- `hyde:` describes the document that would answer the query.
Put the best query first; early searches receive more weight in fusion.
## MCP Tool: `query`
When using the MCP server, prefer structured searches:
```json
{
"searches": [
{ "type": "lex", "query": "CAP theorem consistency" },
{ "type": "vec", "query": "tradeoff between consistency and availability" }
{ "type": "lex", "query": "cockpit OKR Goodhart" },
{ "type": "vec", "query": "data informed not metric driven product judgment" },
{ "type": "hyde", "query": "A concept note explains that metrics are useful as instruments, but leaders should not let OKRs or dashboards replace judgment." }
],
"collections": ["docs"],
"intent": "Find the concept note about using metrics as instruments without becoming metric-driven.",
"collections": ["concepts"],
"limit": 10
}
```
### Query Types
| Type | Method | Input |
|------|--------|-------|
| `lex` | BM25 | Keywords — exact terms, names, code |
| `vec` | Vector | Question — natural language |
| `hyde` | Vector | Answer — hypothetical result (50-100 words) |
- `lex` — BM25 keyword search. Best for exact terms, names, titles, and code.
- `vec` — vector semantic search. Best for natural-language concepts.
- `hyde` — vector search using a hypothetical answer/document passage.
### Writing Good Queries
**lex (keyword)**
- 2-5 terms, no filler words
- Exact phrase: `"connection pool"` (quoted)
- Exclude terms: `performance -sports` (minus prefix)
- Code identifiers work: `handleError async`
**vec (semantic)**
- Full natural language question
- Be specific: `"how does the rate limiter handle burst traffic"`
- Include context: `"in the payment service, how are refunds processed"`
**hyde (hypothetical document)**
- Write 50-100 words of what the *answer* looks like
- Use the vocabulary you expect in the result
**expand (auto-expand)**
- Use a single-line query (implicit) or `expand: question` on its own line
- Lets the local LLM generate lex/vec/hyde variations
- Do not mix `expand:` with other typed lines — it's either a standalone expand query or a full query document
### Intent (Disambiguation)
When a query term is ambiguous, add `intent` to steer results:
```json
{
"searches": [
{ "type": "lex", "query": "performance" }
],
"intent": "web page load times and Core Web Vitals"
}
```
Intent affects expansion, reranking, chunk selection, and snippet extraction. It does not search on its own — it's a steering signal that disambiguates queries like "performance" (web-perf vs team health vs fitness).
### Combining Types
| Goal | Approach |
|------|----------|
| Know exact terms | `lex` only |
| Don't know vocabulary | Use a single-line query (implicit `expand:`) or `vec` |
| Best recall | `lex` + `vec` |
| Complex topic | `lex` + `vec` + `hyde` |
| Ambiguous query | Add `intent` to any combination above |
First query gets 2x weight in fusion — put your best guess first.
### Lex Query Syntax
| Syntax | Meaning | Example |
|--------|---------|---------|
| `term` | Prefix match | `perf` matches "performance" |
| `"phrase"` | Exact phrase | `"rate limiter"` |
| `-term` | Exclude | `performance -sports` |
Note: `-term` only works in lex queries, not vec/hyde.
### Collection Filtering
```json
{ "collections": ["docs"] } // Single
{ "collections": ["docs", "notes"] } // Multiple (OR)
```
Omit to search all collections.
## Other MCP Tools
| Tool | Use |
|------|-----|
| `get` | Retrieve doc by path or `#docid` |
| `multi_get` | Retrieve multiple by glob/list |
| `status` | Collections and health |
## CLI
## Retrieval Commands
```bash
qmd query "question" # Auto-expand + rerank
qmd query $'lex: X\nvec: Y' # Structured
qmd query $'expand: question' # Explicit expand
qmd query --json --explain "q" # Show score traces (RRF + rerank blend)
qmd search "keywords" # BM25 only (no LLM)
qmd get "#abc123" # By docid
qmd multi-get "journals/2026-*.md" -l 40 # Batch pull snippets by glob
qmd multi-get notes/foo.md,notes/bar.md # Comma-separated list, preserves order
qmd get "#abc123" # retrieve by docid
qmd get qmd://concepts/ai-before-headcount.md --full
qmd multi-get 'concepts/{ai-before-headcount.md,data-informed-not-metric-driven.md}' --md
qmd multi-get 'sources/podcast-2025-*.md' -l 80
```
## HTTP API
Use `multi-get` when comparing several hits or gathering context across pages.
Use `--full` when the exact source matters.
## Collection Filtering
```bash
curl -X POST http://localhost:8181/query \
-H "Content-Type: application/json" \
-d '{"searches": [{"type": "lex", "query": "test"}]}'
qmd search "headcount autonomous agents" -c concepts -n 10
qmd query "merchant support product reality" -c concepts -c sources -n 10
```
Omit `-c` / `collections` to search everything. Add collection filters when a
broad query drifts into the wrong corpus.
## Query Craft
Good QMD searches mix three things:
1. **Title/alias anchors:** exact page titles, named entities, phrases.
2. **Semantic paraphrase:** how a human would describe the idea.
3. **Negative space:** enough intent to avoid nearby-but-wrong concepts.
Examples:
```bash
# Exact-ish title lookup
qmd search '"arm the rebels" merchants tools big companies' -c concepts
# Semantic concept lookup
qmd query $'intent: Find the customer proximity concept, not generic customer delight.\nlex: support pseudonymous merchant customer interviews\nvec: founder stays close to merchant reality through support and product use'
# Source lookup
qmd search "six-week cadence WhatsApp merchant relationships Shawn Ryan" -c sources -n 10
```
## Setup
@ -142,3 +165,28 @@ npm install -g @tobilu/qmd
qmd collection add ~/notes --name notes
qmd embed
```
Only add collections or generate embeddings when the user asked for setup or
index maintenance. Searching and retrieving are safe; collection/index mutation is
not a casual first step.
## MCP Setup
See `references/mcp-setup.md` for Claude Code, Claude Desktop, OpenClaw, and HTTP
server configuration.
## Pitfalls
- **Do not stop at snippets.** Fetch documents before making claims.
- **Do not overuse semantic search.** If you know exact titles or terms, BM25 is
faster and often better.
- **Do not mutate indexes casually.** `qmd collection add`, `qmd update`, and
`qmd embed` change local state and can be expensive.
- **Model-backed commands can be environment-sensitive.** If `qmd query`,
`qmd vsearch`, or reranking fails because local models/GPU are unavailable,
use `qmd search` and stronger lexical/structured terms.
- **Ambiguous user wording needs intent.** Add `intent:` rather than hoping query
expansion guesses the right domain.
- **Collection names matter.** Search `concepts` for synthesized wiki pages,
`sources` for transcripts/raw source pages, and docs collections for code/project
documentation.

View File

@ -78,7 +78,7 @@ import {
type ReindexResult,
type ChunkStrategy,
} from "../store.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_MODEL_CACHE_DIR, resolveEmbedModel, resolveGenerateModel, resolveRerankModel, resolveModels } from "../llm.js";
import {
formatSearchResults,
formatDocuments,
@ -311,8 +311,8 @@ function formatETA(seconds: number): string {
// Check index health and print warnings/tips
function checkIndexHealth(db: Database): void {
const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db);
function checkIndexHealth(db: Database, model: string = resolveEmbedModelForCli()): void {
const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db, model);
// Warn if many docs need embedding
if (needsEmbedding > 0) {
@ -410,7 +410,8 @@ async function showStatus(): Promise<void> {
// Overall stats
const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
const needsEmbedding = getHashesNeedingEmbedding(db);
const statusEmbedModel = resolveEmbedModelForCli();
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, statusEmbedModel);
// Most recent update across all collections
const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
@ -536,18 +537,26 @@ async function showStatus(): Promise<void> {
const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
return match ? `https://huggingface.co/${match[1]}` : uri;
};
const activeModels = resolveModelsForCli();
console.log(`\n${c.bold}Models${c.reset}`);
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
console.log(` Embedding: ${hfLink(activeModels.embed)}`);
console.log(` Reranking: ${hfLink(activeModels.rerank)}`);
console.log(` Generation: ${hfLink(activeModels.generate)}`);
}
// Device / GPU info
// Important: probing node-llama-cpp can abort the whole process on machines with
// incompatible GPU drivers (for example Vulkan loader present but no usable driver).
// Keep `qmd status` safe by default and make the expensive/native probe opt-in.
if (process.env.QMD_STATUS_DEVICE_PROBE === "1") {
console.log(`\n${c.bold}Device${c.reset}`);
// Keep the native probe opt-in, but always show how QMD is configured and how to probe.
console.log(`\n${c.bold}Device${c.reset}`);
const configuredGpuMode = process.env.QMD_FORCE_CPU && !["false", "off", "none", "disable", "disabled", "0"].includes(process.env.QMD_FORCE_CPU.trim().toLowerCase())
? "CPU forced (QMD_FORCE_CPU)"
: (process.env.QMD_LLAMA_GPU?.trim() || "auto");
console.log(` Mode: ${configuredGpuMode}`);
if (process.env.QMD_STATUS_DEVICE_PROBE !== "1") {
console.log(` Status: ${c.dim}not probed${c.reset} (set QMD_STATUS_DEVICE_PROBE=1 to test GPU/CPU backend)`);
} else {
console.log(` Status: probing native llama backend...`);
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo({ allowBuild: false });
@ -1794,7 +1803,35 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
}
export function resolveEmbedModelForCli(): string {
return process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL_URI;
try {
return resolveEmbedModel(loadConfig().models);
} catch {
return resolveEmbedModel();
}
}
/**
 * Resolve the generation model identifier for CLI commands.
 *
 * Tries the `models` section of the loaded config first. If loading the
 * config (or resolving with it) throws, resolution is retried with no config
 * hints so the CLI still gets a usable model name.
 */
export function resolveGenerateModelForCli(): string {
  let resolved: string;
  try {
    resolved = resolveGenerateModel(loadConfig().models);
  } catch {
    // Best-effort: any failure above falls back to hint-less resolution.
    resolved = resolveGenerateModel();
  }
  return resolved;
}
/**
 * Resolve the rerank model identifier for CLI commands.
 *
 * Tries the `models` section of the loaded config first. If loading the
 * config (or resolving with it) throws, resolution is retried with no config
 * hints so the CLI still gets a usable model name.
 */
export function resolveRerankModelForCli(): string {
  let resolved: string;
  try {
    resolved = resolveRerankModel(loadConfig().models);
  } catch {
    // Best-effort: any failure above falls back to hint-less resolution.
    resolved = resolveRerankModel();
  }
  return resolved;
}
/**
 * Resolve the embed, generate and rerank model identifiers in one call for
 * CLI commands.
 *
 * Tries the `models` section of the loaded config first. If loading the
 * config (or resolving with it) throws, all three are resolved again with no
 * config hints.
 */
function resolveModelsForCli(): { embed: string; generate: string; rerank: string } {
  let resolved: { embed: string; generate: string; rerank: string };
  try {
    resolved = resolveModels(loadConfig().models);
  } catch {
    // Best-effort: any failure above falls back to hint-less resolution.
    resolved = resolveModels();
  }
  return resolved;
}
async function vectorIndex(
@ -3531,10 +3568,11 @@ if (isMain) {
case "pull": {
const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
const activeModels = resolveModelsForCli();
const models = [
DEFAULT_EMBED_MODEL_URI,
DEFAULT_GENERATE_MODEL_URI,
DEFAULT_RERANK_MODEL_URI,
activeModels.embed,
activeModels.generate,
activeModels.rerank,
];
console.log(`${c.bold}Pulling models${c.reset}`);
const results = await pullModels(models, {

View File

@ -23,7 +23,6 @@ import {
structuredSearch,
extractSnippet,
addLineNumbers,
DEFAULT_EMBED_MODEL,
DEFAULT_MULTI_GET_MAX_BYTES,
reindexCollection,
generateEmbeddings,
@ -423,7 +422,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
});
},
searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),
searchVector: async (q, opts) => internal.searchVec(q, llm.embedModelName, opts?.limit, opts?.collection),
expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),
get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),
getDocumentBody: async (pathOrDocid, opts) => {

View File

@ -31,6 +31,7 @@ async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
/**
 * Test hook: replace the cached node-llama-cpp module import and reset the
 * module-level GPU bookkeeping.
 *
 * @param module Stand-in module to serve from the cached import, or `null`
 *               to clear the cached import.
 */
export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void {
  if (module) {
    nodeLlamaCppImport = Promise.resolve(module);
  } else {
    nodeLlamaCppImport = null;
  }

  // Reset process-wide state so each test starts from a clean slate.
  noGpuAccelerationWarningShown = false;
  failedGpuInitModes.clear();
}
type StdoutWrite = typeof process.stdout.write;
@ -83,7 +84,7 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean {
* Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
*/
export function formatQueryForEmbedding(query: string, modelUri?: string): string {
const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
const uri = modelUri ?? resolveEmbedModel();
if (isQwen3EmbeddingModel(uri)) {
return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
}
@ -96,7 +97,7 @@ export function formatQueryForEmbedding(query: string, modelUri?: string): strin
* Qwen3-Embedding encodes documents as raw text without special prefixes.
*/
export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
const uri = modelUri ?? resolveEmbedModel();
if (isQwen3EmbeddingModel(uri)) {
// Qwen3-Embedding: documents are raw text, no task prefix
return title ? `${title}\n${text}` : text;
@ -255,6 +256,32 @@ export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
/**
 * Optional per-role model hints accepted by the `resolve*Model` helpers.
 * A hint that is missing or empty is ignored and the resolver falls through
 * to its environment variable and then its built-in default.
 */
export type ModelResolutionConfig = {
// Hint for the embedding model.
embed?: string;
// Hint for the generation model.
generate?: string;
// Hint for the rerank model.
rerank?: string;
};
/**
 * Pick the embedding model identifier.
 *
 * Precedence: `config.embed`, then `QMD_EMBED_MODEL`, then
 * `DEFAULT_EMBED_MODEL`. Empty strings count as "not set" at every step.
 *
 * @param config Optional model hints.
 * @returns The first non-empty candidate in precedence order.
 */
export function resolveEmbedModel(config?: ModelResolutionConfig): string {
  const hinted = config?.embed;
  if (hinted) {
    return hinted;
  }

  const fromEnv = process.env.QMD_EMBED_MODEL;
  if (fromEnv) {
    return fromEnv;
  }

  return DEFAULT_EMBED_MODEL;
}
/**
 * Pick the generation model identifier.
 *
 * Precedence: `config.generate`, then `QMD_GENERATE_MODEL`, then
 * `DEFAULT_GENERATE_MODEL`. Empty strings count as "not set" at every step.
 *
 * @param config Optional model hints.
 * @returns The first non-empty candidate in precedence order.
 */
export function resolveGenerateModel(config?: ModelResolutionConfig): string {
  const hinted = config?.generate;
  if (hinted) {
    return hinted;
  }

  const fromEnv = process.env.QMD_GENERATE_MODEL;
  if (fromEnv) {
    return fromEnv;
  }

  return DEFAULT_GENERATE_MODEL;
}
/**
 * Pick the rerank model identifier.
 *
 * Precedence: `config.rerank`, then `QMD_RERANK_MODEL`, then
 * `DEFAULT_RERANK_MODEL`. Empty strings count as "not set" at every step.
 *
 * @param config Optional model hints.
 * @returns The first non-empty candidate in precedence order.
 */
export function resolveRerankModel(config?: ModelResolutionConfig): string {
  const hinted = config?.rerank;
  if (hinted) {
    return hinted;
  }

  const fromEnv = process.env.QMD_RERANK_MODEL;
  if (fromEnv) {
    return fromEnv;
  }

  return DEFAULT_RERANK_MODEL;
}
/**
 * Resolve all three model roles at once.
 *
 * @param config Optional model hints, passed unchanged to each per-role
 *               resolver.
 * @returns An object with `embed`, `generate` and `rerank` all populated.
 */
export function resolveModels(config?: ModelResolutionConfig): Required<ModelResolutionConfig> {
  const embed = resolveEmbedModel(config);
  const generate = resolveGenerateModel(config);
  const rerank = resolveRerankModel(config);
  return { embed, generate, rerank };
}
// Local model cache directory
const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
? join(process.env.XDG_CACHE_HOME, "qmd", "models")
@ -579,6 +606,7 @@ function resolveExpandContextSize(configValue?: number): number {
}
const failedGpuInitModes = new Set<LlamaGpuMode>();
// Module-level latch: set to true the first time the "no GPU acceleration"
// warning is written to stderr, so further LlamaCpp instances in this process
// do not repeat it. Reset to false by setNodeLlamaCppModuleForTest().
let noGpuAccelerationWarningShown = false;
export class LlamaCpp implements LLM {
private readonly _ciMode = !!process.env.CI;
@ -610,9 +638,9 @@ export class LlamaCpp implements LLM {
constructor(config: LlamaCppConfig = {}) {
this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
@ -623,6 +651,14 @@ export class LlamaCpp implements LLM {
return this.embedModelUri;
}
/** The generation model identifier this instance resolved in its constructor. */
get generateModelName(): string {
return this.generateModelUri;
}
/** The rerank model identifier this instance resolved in its constructor. */
get rerankModelName(): string {
return this.rerankModelUri;
}
/**
* Reset the inactivity timer. Called after each model operation.
* When timer fires, models are unloaded to free memory (if no active sessions).
@ -760,9 +796,10 @@ export class LlamaCpp implements LLM {
}
}
if (llama.gpu === false) {
if (llama.gpu === false && !noGpuAccelerationWarningShown) {
noGpuAccelerationWarningShown = true;
process.stderr.write(
"QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
"QMD Warning: no GPU acceleration, running on CPU (slow). Run 'QMD_STATUS_DEVICE_PROBE=1 qmd status' for device details.\n"
);
}
this.llama = llama;

View File

@ -25,6 +25,9 @@ import {
formatQueryForEmbedding,
formatDocForEmbedding,
withLLMSessionForLlm,
DEFAULT_EMBED_MODEL_URI,
DEFAULT_RERANK_MODEL_URI,
DEFAULT_GENERATE_MODEL_URI,
type RerankDocument,
type ILLMSession,
} from "./llm.js";
@ -39,9 +42,9 @@ import type {
// Configuration
// =============================================================================
export const DEFAULT_EMBED_MODEL = "embeddinggemma";
export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI;
export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI;
export const DEFAULT_GLOB = "**/*.md";
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
@ -1749,8 +1752,8 @@ export function createStore(dbPath?: string): Store {
searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
// Query expansion & reranking
expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model, db, intent, store.llm),
expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model ?? store.llm?.generateModelName ?? DEFAULT_QUERY_MODEL, db, intent, store.llm),
rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL, db, intent, store.llm),
// Document retrieval
findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),

View File

@ -27,14 +27,15 @@ let testCounter = 0; // Unique counter for each test run
const thisDir = dirname(fileURLToPath(import.meta.url));
const projectRoot = join(thisDir, "..");
const qmdScript = join(projectRoot, "src", "cli", "qmd.ts");
// Resolve tsx binary from project's node_modules (not cwd-dependent)
const tsxBin = (() => {
const candidate = join(projectRoot, "node_modules", ".bin", "tsx");
if (existsSync(candidate)) {
return candidate;
}
return join(process.cwd(), "node_modules", ".bin", "tsx");
})();
// True when a global `Bun` object is defined, i.e. the tests are presumably
// running under the Bun runtime rather than Node.
const isBunRuntime = typeof (globalThis as { Bun?: unknown }).Bun !== "undefined";
// Path to tsx's CLI entry point inside the project's own node_modules.
const tsxCli = join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs");
// Base command for spawning the qmd CLI with the current runtime executable:
// under Bun the .ts script is passed directly; otherwise it is run through
// tsx's CLI.
const qmdCommand = isBunRuntime
? { command: process.execPath, args: [qmdScript] }
: { command: process.execPath, args: [tsxCli, qmdScript] };
/**
 * Build the spawn command for a qmd invocation.
 *
 * @param args CLI arguments to pass to qmd.
 * @returns The executable to spawn and its full argument list: the base
 *          runner arguments followed by `args`. A fresh array is returned,
 *          so the shared base arguments are never mutated.
 */
function qmdRunnerArgs(args: string[]): { command: string; args: string[] } {
  const { command, args: baseArgs } = qmdCommand;
  return { command, args: baseArgs.concat(args) };
}
// Helper to run qmd command with test database
async function runQmd(
@ -44,7 +45,8 @@ async function runQmd(
const workingDir = options.cwd || fixturesDir;
const dbPath = options.dbPath || testDbPath;
const configDir = options.configDir || testConfigDir;
const proc = spawn(tsxBin, [qmdScript, ...args], {
const runner = qmdRunnerArgs(args);
const proc = spawn(runner.command, runner.args, {
cwd: workingDir,
env: {
...process.env,
@ -252,15 +254,15 @@ describe("CLI Skills", () => {
expect(stderr).toBe("");
expect(exitCode).toBe(0);
expect(stdout).toContain("qmd");
expect(stdout).toContain("Search markdown knowledge bases");
expect(stdout).toContain("Search local markdown knowledge bases");
});
test("gets version-matched runtime skill content", async () => {
const { stdout, stderr, exitCode } = await runQmd(["skills", "get", "qmd"]);
expect(stderr).toBe("");
expect(exitCode).toBe(0);
expect(stdout).toContain("# QMD - Quick Markdown Search");
expect(stdout).toContain("## MCP: `query`");
expect(stdout).toContain("# QMD - Query Markdown Documents");
expect(stdout).toContain("## MCP Tool: `query`");
expect(stdout).not.toContain("This file is a discovery stub");
});
@ -268,7 +270,7 @@ describe("CLI Skills", () => {
const { stdout, stderr, exitCode } = await runQmd(["skills", "get", "qmd", "--full"]);
expect(stderr).toBe("");
expect(exitCode).toBe(0);
expect(stdout).toContain("# QMD - Quick Markdown Search");
expect(stdout).toContain("# QMD - Query Markdown Documents");
expect(stdout).toContain("--- references/mcp-setup.md ---");
expect(stdout).toContain("# QMD MCP Server Setup");
});
@ -284,8 +286,8 @@ describe("CLI Skills", () => {
const { stdout, stderr, exitCode } = await runQmd(["skill", "show"]);
expect(stderr).toBe("");
expect(exitCode).toBe(0);
expect(stdout).toContain("# QMD - Quick Markdown Search");
expect(stdout).toContain("## MCP: `query`");
expect(stdout).toContain("# QMD - Query Markdown Documents");
expect(stdout).toContain("## MCP Tool: `query`");
expect(stdout).not.toContain("This file is a discovery stub");
});
@ -300,8 +302,8 @@ describe("CLI Skills", () => {
const installedSkillDir = join(installDir, ".agents", "skills", "qmd");
const installed = readFileSync(join(installedSkillDir, "SKILL.md"), "utf8");
expect(installed).toContain("# QMD - Quick Markdown Search");
expect(installed).toContain("## MCP: `query`");
expect(installed).toContain("# QMD - Query Markdown Documents");
expect(installed).toContain("## MCP Tool: `query`");
expect(installed).not.toContain("This file is a discovery stub");
expect(readFileSync(join(installedSkillDir, "references", "mcp-setup.md"), "utf8")).toContain("# QMD MCP Server Setup");
});
@ -370,7 +372,7 @@ describe("CLI Skill Commands", () => {
expect(exitCode).toBe(0);
const skillDir = join(projectDir, ".agents", "skills", "qmd");
expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
expect(existsSync(join(projectDir, ".claude", "skills", "qmd"))).toBe(false);
expect(stdout).toContain(`✓ Installed QMD skill to ${skillDir}`);
expect(stdout).toContain("Tip: create a Claude symlink manually");
@ -388,9 +390,9 @@ describe("CLI Skill Commands", () => {
const skillDir = join(fakeHome, ".agents", "skills", "qmd");
const claudeLink = join(fakeHome, ".claude", "skills", "qmd");
expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
expect(lstatSync(claudeLink).isSymbolicLink()).toBe(true);
expect(readFileSync(join(claudeLink, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
expect(readFileSync(join(claudeLink, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
expect(stdout).toContain(`✓ Installed QMD skill to ${skillDir}`);
expect(stdout).toContain(`✓ Linked Claude skill at ${claudeLink}`);
});
@ -408,7 +410,7 @@ describe("CLI Skill Commands", () => {
const skillDir = join(fakeHome, ".agents", "skills", "qmd");
expect(lstatSync(skillDir).isSymbolicLink()).toBe(false);
expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
expect(stdout).toContain(`✓ Claude already sees the skill via ${join(fakeHome, ".claude", "skills")}`);
});
@ -470,10 +472,13 @@ describe("CLI Status Command", () => {
expect(stdout).toContain("Collection");
});
test("skips device probing by default", async () => {
test("shows device mode without native probing by default", async () => {
const { stdout, exitCode } = await runQmd(["status"]);
expect(exitCode).toBe(0);
expect(stdout).not.toContain("Device");
expect(stdout).toContain("Device");
expect(stdout).toContain("Mode:");
expect(stdout).toContain("not probed");
expect(stdout).toContain("QMD_STATUS_DEVICE_PROBE=1");
});
});
@ -1577,7 +1582,8 @@ describe("mcp http daemon", () => {
port: number,
options: { args?: string[]; env?: Record<string, string> } = {},
): import("child_process").ChildProcess {
const proc = spawn(tsxBin, [qmdScript, ...(options.args ?? []), "mcp", "--http", "--port", String(port)], {
const runner = qmdRunnerArgs([...(options.args ?? []), "mcp", "--http", "--port", String(port)]);
const proc = spawn(runner.command, runner.args, {
cwd: fixturesDir,
env: {
...process.env,

View File

@ -17,6 +17,10 @@ import {
withNativeStdoutRedirectedToStderr,
resolveParallelismOverride,
resolveSafeParallelism,
resolveEmbedModel,
resolveGenerateModel,
resolveRerankModel,
resolveModels,
withLLMSession,
canUnloadLLM,
SessionReleasedError,
@ -24,6 +28,63 @@ import {
type ILLMSession,
} from "../src/llm.js";
describe("model name resolution", () => {
  // Environment variables consulted by the model resolvers; always snapshotted.
  const MODEL_ENV_KEYS = ["QMD_EMBED_MODEL", "QMD_GENERATE_MODEL", "QMD_RERANK_MODEL"];

  /**
   * Run `fn` with temporary environment overrides and restore the previous
   * values afterwards, even if `fn` throws.
   *
   * Every key in `env` is snapshotted along with the three model variables,
   * so an override for any other variable cannot leak into later tests.
   *
   * @param env Variables to set; an `undefined` value unsets the variable.
   * @param fn  Synchronous body to run under the overrides.
   */
  function withModelEnv(env: Record<string, string | undefined>, fn: () => void): void {
    // Snapshot before mutating anything; duplicate keys are harmless here.
    const previous: Record<string, string | undefined> = {};
    for (const key of [...MODEL_ENV_KEYS, ...Object.keys(env)]) {
      previous[key] = process.env[key];
    }

    try {
      for (const [key, value] of Object.entries(env)) {
        // Assigning undefined would store the string "undefined"; delete instead.
        if (value === undefined) delete process.env[key];
        else process.env[key] = value;
      }
      fn();
    } finally {
      for (const [key, value] of Object.entries(previous)) {
        if (value === undefined) delete process.env[key];
        else process.env[key] = value;
      }
    }
  }

  test("all model roles resolve config hints before env fallbacks", () => {
    withModelEnv({
      QMD_EMBED_MODEL: "env-embed",
      QMD_GENERATE_MODEL: "env-generate",
      QMD_RERANK_MODEL: "env-rerank",
    }, () => {
      const config = {
        embed: "config-embed",
        generate: "config-generate",
        rerank: "config-rerank",
      };

      expect(resolveEmbedModel(config)).toBe("config-embed");
      expect(resolveGenerateModel(config)).toBe("config-generate");
      expect(resolveRerankModel(config)).toBe("config-rerank");
      expect(resolveModels(config)).toEqual(config);
    });
  });

  test("all model roles fall back to env when no config hint is given", () => {
    withModelEnv({
      QMD_EMBED_MODEL: "env-embed",
      QMD_GENERATE_MODEL: "env-generate",
      QMD_RERANK_MODEL: "env-rerank",
    }, () => {
      const fromEnv = {
        embed: "env-embed",
        generate: "env-generate",
        rerank: "env-rerank",
      };

      expect(resolveModels()).toEqual(fromEnv);
      // Empty hints count as "not set" and must not shadow the environment.
      expect(resolveModels({ embed: "", generate: "", rerank: "" })).toEqual(fromEnv);
    });
  });

  test("LlamaCpp constructor uses the same resolver as status/embed/query helpers", () => {
    withModelEnv({
      QMD_EMBED_MODEL: "env-embed",
      QMD_GENERATE_MODEL: "env-generate",
      QMD_RERANK_MODEL: "env-rerank",
    }, () => {
      const llm = new LlamaCpp({
        embedModel: "config-embed",
        generateModel: "config-generate",
        rerankModel: "config-rerank",
      });

      expect(llm.embedModelName).toBe(resolveEmbedModel({ embed: "config-embed" }));
      expect(llm.generateModelName).toBe(resolveGenerateModel({ generate: "config-generate" }));
      expect(llm.rerankModelName).toBe(resolveRerankModel({ rerank: "config-rerank" }));
    });
  });
});
// =============================================================================
// Singleton Tests (no model loading required)
// =============================================================================
@ -178,6 +239,40 @@ describe("native llama stdout containment", () => {
else process.env.QMD_FORCE_CPU = prevForceCpu;
}
});
// Two LlamaCpp instances initialise against a stubbed backend that reports no
// GPU; the fallback notice written to stderr must appear exactly once.
test("warns about CPU fallback only once per process", async () => {
  // Remember the variables this test mutates so they can be put back afterwards.
  const savedEnv = {
    QMD_LLAMA_GPU: process.env.QMD_LLAMA_GPU,
    QMD_FORCE_CPU: process.env.QMD_FORCE_CPU,
  };
  const restoreEnv = (key: keyof typeof savedEnv): void => {
    const original = savedEnv[key];
    if (original === undefined) {
      delete process.env[key];
    } else {
      process.env[key] = original;
    }
  };

  process.env.QMD_LLAMA_GPU = "false";
  delete process.env.QMD_FORCE_CPU;

  // Stand-in for the native module: getLlama resolves to a backend with gpu: false.
  setNodeLlamaCppModuleForTest({
    LlamaLogLevel: { error: "error" },
    resolveModelFile: vi.fn(),
    LlamaChatSession: vi.fn() as any,
    getLlama: vi.fn(async () => ({ gpu: false, cpuMathCores: 4 }) as any),
  });
  // Capture stderr writes instead of letting them reach the terminal.
  const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);

  try {
    const instances = [new LlamaCpp(), new LlamaCpp()];
    for (const instance of instances) {
      await (instance as any).ensureLlama();
    }

    const written = stderrSpy.mock.calls.map(([chunk]) => chunk).join("");
    const warnings = written.match(/no GPU acceleration/g);
    expect(warnings?.length).toBe(1);
    expect(written).toContain("QMD_STATUS_DEVICE_PROBE=1 qmd status");
  } finally {
    stderrSpy.mockRestore();
    setNodeLlamaCppModuleForTest(null);
    restoreEnv("QMD_LLAMA_GPU");
    restoreEnv("QMD_FORCE_CPU");
  }
});
});
describe("LLM context parallelism safety", () => {

View File

@ -5,6 +5,17 @@ import { tmpdir } from "node:os";
import { afterEach, describe, expect, test } from "vitest";
import { findLocalConfigPath, getLocalDbPath } from "../src/collections.js";
/**
 * Builds the executable and argument list for running the QMD CLI source
 * (`src/cli/qmd.ts` under the current working directory) with one command.
 *
 * When `process.versions.bun` is set, the current executable is handed the
 * TypeScript entry point directly. Otherwise the current executable is pointed
 * at the tsx CLI script under `node_modules`, followed by the entry point.
 *
 * @param command - The single CLI argument appended after the entry point.
 * @returns The binary to execute and the arguments to pass to it.
 */
function cliCommandArgs(command: string): { bin: string; args: string[] } {
  const workspace = process.cwd();
  const entryPoint = join(workspace, "src/cli/qmd.ts");
  // Only the non-Bun path needs a launcher script in front of the entry point.
  const launcherArgs = process.versions.bun
    ? []
    : [join(workspace, "node_modules/tsx/dist/cli.mjs")];
  return { bin: process.execPath, args: [...launcherArgs, entryPoint, command] };
}
const roots: string[] = [];
function tempProject(): string {
@ -56,12 +67,11 @@ describe("local .qmd project config", () => {
mkdirSync(join(root, ".qmd"), { recursive: true });
mkdirSync(join(root, "docs"), { recursive: true });
writeFileSync(join(root, "docs", "a.md"), "# A\n\nLocal test document.\n");
writeFileSync(join(root, ".qmd", "index.yaml"), `collections:\n docs:\n path: ${JSON.stringify(join(root, "docs"))}\n pattern: "**/*.md"\n context:\n /: Local test docs\n`);
writeFileSync(join(root, ".qmd", "index.yaml"), `collections:\n docs:\n path: ${JSON.stringify(join(root, "docs"))}\n pattern: "**/*.md"\n context:\n /: Local test docs\nmodels:\n embed: local-embed-model\n rerank: local-rerank-model\n generate: local-generate-model\n`);
const home = join(root, "home");
const tsxBin = join(process.cwd(), "node_modules", ".bin", "tsx");
const runner = existsSync(tsxBin) ? tsxBin : "bun";
const output = execFileSync(runner, [join(process.cwd(), "src/cli/qmd.ts"), "status"], {
const { bin, args } = cliCommandArgs("status");
const output = execFileSync(bin, args, {
cwd: root,
encoding: "utf-8",
env: {
@ -69,12 +79,19 @@ describe("local .qmd project config", () => {
HOME: home,
XDG_CONFIG_HOME: join(home, ".config"),
XDG_CACHE_HOME: join(home, ".cache"),
QMD_EMBED_MODEL: "env-embed-model",
QMD_RERANK_MODEL: "env-rerank-model",
QMD_GENERATE_MODEL: "env-generate-model",
},
});
const localIndex = join(root, ".qmd", "index.sqlite");
expect(output).toContain(`Index: ${realpathSync(localIndex)}`);
expect(output).toContain("docs (qmd://docs/)");
expect(output).toContain("Embedding: local-embed-model");
expect(output).toContain("Reranking: local-rerank-model");
expect(output).toContain("Generation: local-generate-model");
expect(output).not.toContain("env-embed-model");
expect(existsSync(localIndex)).toBe(true);
expect(existsSync(join(home, ".cache", "qmd", "index.sqlite"))).toBe(false);
});

View File

@ -186,7 +186,7 @@ function seedTestData(db: Database): void {
for (let i = 0; i < 768; i++) embedding[i] = Math.random();
for (const doc of docs.slice(0, 4)) { // Skip large file for embeddings
db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`).run(doc.hash, now);
db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, now);
db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${doc.hash}_0`, embedding);
}
}

View File

@ -20,8 +20,8 @@ describe("package grammar distribution", () => {
expect(pkg.files, "published package files").toContain("scripts/check-package-grammars.mjs");
expect(pkg.files, "published package files").toContain("skills/");
const qmdSkill = readFileSync(new URL("skills/qmd/SKILL.md", root), "utf8");
expect(qmdSkill).toContain("# QMD - Quick Markdown Search");
expect(qmdSkill).toContain("## MCP: `query`");
expect(qmdSkill).toContain("# QMD - Query Markdown Documents");
expect(qmdSkill).toContain("## MCP Tool: `query`");
expect(qmdSkill).not.toContain("This file is a discovery stub");
const scriptPath = join(root.pathname, "scripts", "check-package-grammars.mjs");