Merge pull request #656 from tobi/fix/gpu-status-warning

Fix GPU status guidance and benchmark warnings
2026-05-16 19:55:39 -04:00 · 2026-05-16 19:55:39 -04:00 · ddbd6bd8be
commit ddbd6bd8be
parent cdf3bc0712 ad8a371be2
12 changed files with 415 additions and 164 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,11 @@

 ### Fixes

+- Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable.
+- Tests: make `bun run test` execute the local unit suite under both Node/Vitest and Bun (`test:node` + `test:bun`) so runtime-specific regressions are caught before CI.
+- Model config: centralize embedding/rerank/generation model resolution so `qmd embed`, `status`, `query`, `vsearch`, `pull`, SDK vector search, and `bench` use the same active `.qmd/index.yaml` model hints and environment fallbacks.
+- GPU/status: `qmd status` now uses the same embedding model identity as `qmd embed` when computing pending embeddings, so URI-backed embeddings are not incorrectly reported as pending under the legacy `embeddinggemma` alias.
+- GPU/status: `qmd status` now always shows GPU mode/configuration without unsafe native probing, and CPU-fallback warnings point to `QMD_STATUS_DEVICE_PROBE=1 qmd status` for an actual backend probe. The no-GPU warning is emitted once per process instead of once per LLM instance during benchmarks.
 - GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
 - Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
  (CLI JSON output and snippet headers) now return absolute source-file
--- a/package.json
+++ b/package.json
@ -25,7 +25,10 @@
  "scripts": {
    "prepare": "[ -d .git ] && ./scripts/install-hooks.sh || true",
    "build": "tsc -p tsconfig.build.json && printf '#!/usr/bin/env node\n' | cat - dist/cli/qmd.js > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
-    "test": "vitest run --reporter=verbose test/",
+    "test": "bun run test:unit",
+    "test:node": "node ./node_modules/vitest/vitest.mjs run --reporter=verbose",
+    "test:bun": "bun test --preload ./src/test-preload.ts",
+    "test:unit": "bun run test:node -- test/ && bun run test:bun -- test/",
    "qmd": "tsx src/cli/qmd.ts",
    "index": "tsx src/cli/qmd.ts index",
    "vector": "tsx src/cli/qmd.ts vector",
--- a/skills/qmd/SKILL.md
+++ b/skills/qmd/SKILL.md
@ -1,138 +1,161 @@
 ---
 name: qmd
-description: Search markdown knowledge bases, notes, and documentation using QMD. Use when users ask to search notes, find documents, or look up information.
+description: Search local markdown knowledge bases, notes, docs, and wikis with QMD. Use when users ask to find notes, retrieve documents, inspect a wiki, answer from indexed markdown, or set up QMD access.
 license: MIT
 compatibility: Requires qmd CLI or MCP server. Install via `npm install -g @tobilu/qmd`.
 metadata:
  author: tobi
-  version: "2.0.0"
+  version: "2.1.0"
 allowed-tools: Bash(qmd:*), mcp__qmd__*
 ---

-# QMD - Quick Markdown Search
+# QMD - Query Markdown Documents

-Local search engine for markdown content.
+QMD is a local search and retrieval engine for markdown collections: notes, docs,
+wikis, transcripts, and project knowledge bases. Use it before generic web search
+when the user is asking about something that may already live in their indexed
+local markdown.

-## Status
+## Status Check

-!`qmd status 2>/dev/null || echo "Not installed: npm install -g @tobilu/qmd"`
+Start by checking what QMD can see:

-## MCP: `query`
+```bash
+qmd collection list
+qmd ls
+```
+
+For health details:
+
+```bash
+qmd status
+```
+
+If QMD is missing:
+
+```bash
+npm install -g @tobilu/qmd
+```
+
+## Retrieval Workflow
+
+1. **Discover collections** with `qmd collection list` or `qmd ls`.
+2. **Search first**, usually with a small result count.
+3. **Retrieve source documents** with `qmd get` or `qmd multi-get`.
+4. **Answer from the retrieved text**, citing file paths or docids.
+5. **If results are weak**, rewrite the query using a different search mode.
+
+Do not answer from search-result snippets alone when the user needs substance.
+Fetch the document.
+
+## Search Modes
+
+### Fast lexical search
+
+Use BM25 when you know names, exact terms, titles, identifiers, or code symbols:
+
+```bash
+qmd search "cockpit OKR Goodhart" -n 10
+qmd search '"AI Before Headcount"' -c concepts -n 5
+```
+
+Good lexical (`lex`) queries are short: 2-6 discriminative terms, quoted phrases
+for exact matches, and no filler words.
+
+### Hybrid query search
+
+Use `qmd query` when semantic recall, query expansion, vector search, or reranking
+matters more than speed:
+
+```bash
+qmd query "decision quality depends on surfacing assumptions and context" -n 10
+qmd query --json --explain "metrics as cockpit instruments but not OKRs"
+```
+
+`qmd query` may initialize local models. If models/GPU are unavailable, slow, or
+crashing, fall back to `qmd search` and use better lexical terms.
+
+### Structured queries
+
+For subtle wiki/doc searches, structured queries are usually strongest:
+
+```bash
+qmd query $'intent: Find the concept note about metrics as instruments without letting OKRs replace judgment.\nlex: cockpit instruments OKR Goodhart metrics judgment\nvec: data informed not metric driven product judgment\nhyde: A concept note says metrics are useful like cockpit instruments, but leaders should remain data-informed rather than metric-driven because OKRs and dashboards can Goodhart product judgment.'
+```
+
+Use this pattern when the user's wording is indirect:
+
+- `intent:` disambiguates the target.
+- `lex:` anchors exact names, phrases, aliases, and rare terms.
+- `vec:` adds the semantic paraphrase.
+- `hyde:` describes the document that would answer the query.
+
+Put the best query first; early searches receive more weight in fusion.
+
+## MCP Tool: `query`
+
+When using the MCP server, prefer structured searches:

 ```json
 {
  "searches": [
-    { "type": "lex", "query": "CAP theorem consistency" },
-    { "type": "vec", "query": "tradeoff between consistency and availability" }
+    { "type": "lex", "query": "cockpit OKR Goodhart" },
+    { "type": "vec", "query": "data informed not metric driven product judgment" },
+    { "type": "hyde", "query": "A concept note explains that metrics are useful as instruments, but leaders should not let OKRs or dashboards replace judgment." }
  ],
-  "collections": ["docs"],
+  "intent": "Find the concept note about using metrics as instruments without becoming metric-driven.",
+  "collections": ["concepts"],
  "limit": 10
 }
 ```

 ### Query Types

-| Type | Method | Input |
-|------|--------|-------|
-| `lex` | BM25 | Keywords — exact terms, names, code |
-| `vec` | Vector | Question — natural language |
-| `hyde` | Vector | Answer — hypothetical result (50-100 words) |
+- `lex` — BM25 keyword search. Best for exact terms, names, titles, and code.
+- `vec` — vector semantic search. Best for natural-language concepts.
+- `hyde` — vector search using a hypothetical answer/document passage.

-### Writing Good Queries
-
-**lex (keyword)**
- 2-5 terms, no filler words
- Exact phrase: `"connection pool"` (quoted)
- Exclude terms: `performance -sports` (minus prefix)
- Code identifiers work: `handleError async`
-
-**vec (semantic)**
- Full natural language question
- Be specific: `"how does the rate limiter handle burst traffic"`
- Include context: `"in the payment service, how are refunds processed"`
-
-**hyde (hypothetical document)**
- Write 50-100 words of what the *answer* looks like
- Use the vocabulary you expect in the result
-
-**expand (auto-expand)**
- Use a single-line query (implicit) or `expand: question` on its own line
- Lets the local LLM generate lex/vec/hyde variations
- Do not mix `expand:` with other typed lines — it's either a standalone expand query or a full query document
-
-### Intent (Disambiguation)
-
-When a query term is ambiguous, add `intent` to steer results:
-
-```json
-{
-  "searches": [
-    { "type": "lex", "query": "performance" }
-  ],
-  "intent": "web page load times and Core Web Vitals"
-}
-```
-
-Intent affects expansion, reranking, chunk selection, and snippet extraction. It does not search on its own — it's a steering signal that disambiguates queries like "performance" (web-perf vs team health vs fitness).
-
-### Combining Types
-
-| Goal | Approach |
-|------|----------|
-| Know exact terms | `lex` only |
-| Don't know vocabulary | Use a single-line query (implicit `expand:`) or `vec` |
-| Best recall | `lex` + `vec` |
-| Complex topic | `lex` + `vec` + `hyde` |
-| Ambiguous query | Add `intent` to any combination above |
-
-First query gets 2x weight in fusion — put your best guess first.
-
-### Lex Query Syntax
-
-| Syntax | Meaning | Example |
-|--------|---------|---------|
-| `term` | Prefix match | `perf` matches "performance" |
-| `"phrase"` | Exact phrase | `"rate limiter"` |
-| `-term` | Exclude | `performance -sports` |
-
-Note: `-term` only works in lex queries, not vec/hyde.
-
-### Collection Filtering
-
-```json
-{ "collections": ["docs"] }              // Single
-{ "collections": ["docs", "notes"] }     // Multiple (OR)
-```
-
-Omit to search all collections.
-
-## Other MCP Tools
-
-| Tool | Use |
-|------|-----|
-| `get` | Retrieve doc by path or `#docid` |
-| `multi_get` | Retrieve multiple by glob/list |
-| `status` | Collections and health |
-
-## CLI
+## Retrieval Commands

 ```bash
-qmd query "question"              # Auto-expand + rerank
-qmd query $'lex: X\nvec: Y'       # Structured
-qmd query $'expand: question'     # Explicit expand
-qmd query --json --explain "q"    # Show score traces (RRF + rerank blend)
-qmd search "keywords"             # BM25 only (no LLM)
-qmd get "#abc123"                 # By docid
-qmd multi-get "journals/2026-*.md" -l 40  # Batch pull snippets by glob
-qmd multi-get notes/foo.md,notes/bar.md   # Comma-separated list, preserves order
+qmd get "#abc123"                         # retrieve by docid
+qmd get qmd://concepts/ai-before-headcount.md --full
+qmd multi-get 'concepts/{ai-before-headcount.md,data-informed-not-metric-driven.md}' --md
+qmd multi-get 'sources/podcast-2025-*.md' -l 80
 ```

-## HTTP API
+Use `multi-get` when comparing several hits or gathering context across pages.
+Use `--full` when the exact source matters.
+
+## Collection Filtering

 ```bash
-curl -X POST http://localhost:8181/query \
-  -H "Content-Type: application/json" \
-  -d '{"searches": [{"type": "lex", "query": "test"}]}'
+qmd search "headcount autonomous agents" -c concepts -n 10
+qmd query "merchant support product reality" -c concepts -c sources -n 10
+```
+
+Omit `-c` / `collections` to search everything. Add collection filters when a
+broad query drifts into the wrong corpus.
+
+## Query Craft
+
+Good QMD searches mix three things:
+
+1. **Title/alias anchors:** exact page titles, named entities, phrases.
+2. **Semantic paraphrase:** how a human would describe the idea.
+3. **Negative space:** enough intent to avoid nearby-but-wrong concepts.
+
+Examples:
+
+```bash
+# Exact-ish title lookup
+qmd search '"arm the rebels" merchants tools big companies' -c concepts
+
+# Semantic concept lookup
+qmd query $'intent: Find the customer proximity concept, not generic customer delight.\nlex: support pseudonymous merchant customer interviews\nvec: founder stays close to merchant reality through support and product use'
+
+# Source lookup
+qmd search "six-week cadence WhatsApp merchant relationships Shawn Ryan" -c sources -n 10
 ```

 ## Setup
@ -142,3 +165,28 @@ npm install -g @tobilu/qmd
 qmd collection add ~/notes --name notes
 qmd embed
 ```
+
+Only add collections or generate embeddings when the user asked for setup or
+index maintenance. Searching and retrieving are safe; collection/index mutation is
+not a casual first step.
+
+## MCP Setup
+
+See `references/mcp-setup.md` for Claude Code, Claude Desktop, OpenClaw, and HTTP
+server configuration.
+
+## Pitfalls
+
+- **Do not stop at snippets.** Fetch documents before making claims.
+- **Do not overuse semantic search.** If you know exact titles or terms, BM25 is
+  faster and often better.
+- **Do not mutate indexes casually.** `qmd collection add`, `qmd update`, and
+  `qmd embed` change local state and can be expensive.
+- **Model-backed commands can be environment-sensitive.** If `qmd query`,
+  `qmd vsearch`, or reranking fails because local models/GPU are unavailable,
+  use `qmd search` and stronger lexical/structured terms.
+- **Ambiguous user wording needs intent.** Add `intent:` rather than hoping query
+  expansion guesses the right domain.
+- **Collection names matter.** Search `concepts` for synthesized wiki pages,
+  `sources` for transcripts/raw source pages, and docs collections for code/project
+  documentation.
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -78,7 +78,7 @@ import {
  type ReindexResult,
  type ChunkStrategy,
 } from "../store.js";
-import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_MODEL_CACHE_DIR, resolveEmbedModel, resolveGenerateModel, resolveRerankModel, resolveModels } from "../llm.js";
 import {
  formatSearchResults,
  formatDocuments,
@ -311,8 +311,8 @@ function formatETA(seconds: number): string {


 // Check index health and print warnings/tips
-function checkIndexHealth(db: Database): void {
-  const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db);
+function checkIndexHealth(db: Database, model: string = resolveEmbedModelForCli()): void {
+  const { needsEmbedding, totalDocs, daysStale } = getIndexHealth(db, model);

  // Warn if many docs need embedding
  if (needsEmbedding > 0) {
@ -410,7 +410,8 @@ async function showStatus(): Promise<void> {
  // Overall stats
  const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
  const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
-  const needsEmbedding = getHashesNeedingEmbedding(db);
+  const statusEmbedModel = resolveEmbedModelForCli();
+  const needsEmbedding = getHashesNeedingEmbedding(db, undefined, statusEmbedModel);

  // Most recent update across all collections
  const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
@ -536,18 +537,26 @@ async function showStatus(): Promise<void> {
      const match = uri.match(/^hf:([^/]+\/[^/]+)\//);
      return match ? `https://huggingface.co/${match[1]}` : uri;
    };
+    const activeModels = resolveModelsForCli();
    console.log(`\n${c.bold}Models${c.reset}`);
-    console.log(`  Embedding:   ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
-    console.log(`  Reranking:   ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
-    console.log(`  Generation:  ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
+    console.log(`  Embedding:   ${hfLink(activeModels.embed)}`);
+    console.log(`  Reranking:   ${hfLink(activeModels.rerank)}`);
+    console.log(`  Generation:  ${hfLink(activeModels.generate)}`);
  }

  // Device / GPU info
  // Important: probing node-llama-cpp can abort the whole process on machines with
  // incompatible GPU drivers (for example Vulkan loader present but no usable driver).
-  // Keep `qmd status` safe by default and make the expensive/native probe opt-in.
-  if (process.env.QMD_STATUS_DEVICE_PROBE === "1") {
-    console.log(`\n${c.bold}Device${c.reset}`);
+  // Keep the native probe opt-in, but always show how QMD is configured and how to probe.
+  console.log(`\n${c.bold}Device${c.reset}`);
+  const configuredGpuMode = process.env.QMD_FORCE_CPU && !["false", "off", "none", "disable", "disabled", "0"].includes(process.env.QMD_FORCE_CPU.trim().toLowerCase())
+    ? "CPU forced (QMD_FORCE_CPU)"
+    : (process.env.QMD_LLAMA_GPU?.trim() || "auto");
+  console.log(`  Mode:     ${configuredGpuMode}`);
+  if (process.env.QMD_STATUS_DEVICE_PROBE !== "1") {
+    console.log(`  Status:   ${c.dim}not probed${c.reset} (set QMD_STATUS_DEVICE_PROBE=1 to test GPU/CPU backend)`);
+  } else {
+    console.log(`  Status:   probing native llama backend...`);
    try {
      const llm = getDefaultLlamaCpp();
      const device = await llm.getDeviceInfo({ allowBuild: false });
@ -1794,7 +1803,35 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
 }

 export function resolveEmbedModelForCli(): string {
-  return process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL_URI;
+  try {
+    return resolveEmbedModel(loadConfig().models);
+  } catch {
+    return resolveEmbedModel();
+  }
+}
+
+export function resolveGenerateModelForCli(): string {
+  try {
+    return resolveGenerateModel(loadConfig().models);
+  } catch {
+    return resolveGenerateModel();
+  }
+}
+
+export function resolveRerankModelForCli(): string {
+  try {
+    return resolveRerankModel(loadConfig().models);
+  } catch {
+    return resolveRerankModel();
+  }
+}
+
+function resolveModelsForCli(): { embed: string; generate: string; rerank: string } {
+  try {
+    return resolveModels(loadConfig().models);
+  } catch {
+    return resolveModels();
+  }
 }

 async function vectorIndex(
@ -3531,10 +3568,11 @@ if (isMain) {

    case "pull": {
      const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
+      const activeModels = resolveModelsForCli();
      const models = [
-        DEFAULT_EMBED_MODEL_URI,
-        DEFAULT_GENERATE_MODEL_URI,
-        DEFAULT_RERANK_MODEL_URI,
+        activeModels.embed,
+        activeModels.generate,
+        activeModels.rerank,
      ];
      console.log(`${c.bold}Pulling models${c.reset}`);
      const results = await pullModels(models, {
--- a/src/index.ts
+++ b/src/index.ts
@ -23,7 +23,6 @@ import {
  structuredSearch,
  extractSnippet,
  addLineNumbers,
-  DEFAULT_EMBED_MODEL,
  DEFAULT_MULTI_GET_MAX_BYTES,
  reindexCollection,
  generateEmbeddings,
@ -423,7 +422,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
      });
    },
    searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
-    searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),
+    searchVector: async (q, opts) => internal.searchVec(q, llm.embedModelName, opts?.limit, opts?.collection),
    expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),
    get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),
    getDocumentBody: async (pathOrDocid, opts) => {
--- a/src/llm.ts
+++ b/src/llm.ts
@ -31,6 +31,7 @@ async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
 export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void {
  nodeLlamaCppImport = module ? Promise.resolve(module) : null;
  failedGpuInitModes.clear();
+  noGpuAccelerationWarningShown = false;
 }

 type StdoutWrite = typeof process.stdout.write;
@ -83,7 +84,7 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean {
 * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
 */
 export function formatQueryForEmbedding(query: string, modelUri?: string): string {
-  const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+  const uri = modelUri ?? resolveEmbedModel();
  if (isQwen3EmbeddingModel(uri)) {
    return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
  }
@ -96,7 +97,7 @@ export function formatQueryForEmbedding(query: string, modelUri?: string): strin
 * Qwen3-Embedding encodes documents as raw text without special prefixes.
 */
 export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
-  const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+  const uri = modelUri ?? resolveEmbedModel();
  if (isQwen3EmbeddingModel(uri)) {
    // Qwen3-Embedding: documents are raw text, no task prefix
    return title ? `${title}\n${text}` : text;
@ -255,6 +256,32 @@ export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;

+export type ModelResolutionConfig = {
+  embed?: string;
+  generate?: string;
+  rerank?: string;
+};
+
+export function resolveEmbedModel(config?: ModelResolutionConfig): string {
+  return config?.embed || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
+}
+
+export function resolveGenerateModel(config?: ModelResolutionConfig): string {
+  return config?.generate || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
+}
+
+export function resolveRerankModel(config?: ModelResolutionConfig): string {
+  return config?.rerank || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
+}
+
+export function resolveModels(config?: ModelResolutionConfig): Required<ModelResolutionConfig> {
+  return {
+    embed: resolveEmbedModel(config),
+    generate: resolveGenerateModel(config),
+    rerank: resolveRerankModel(config),
+  };
+}
+
 // Local model cache directory
 const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
  ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
@ -579,6 +606,7 @@ function resolveExpandContextSize(configValue?: number): number {
 }

 const failedGpuInitModes = new Set<LlamaGpuMode>();
+let noGpuAccelerationWarningShown = false;

 export class LlamaCpp implements LLM {
  private readonly _ciMode = !!process.env.CI;
@ -610,9 +638,9 @@ export class LlamaCpp implements LLM {


  constructor(config: LlamaCppConfig = {}) {
-    this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
-    this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
-    this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
+    this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
+    this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
+    this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
    this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
    this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
    this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
@ -623,6 +651,14 @@ export class LlamaCpp implements LLM {
    return this.embedModelUri;
  }

+  get generateModelName(): string {
+    return this.generateModelUri;
+  }
+
+  get rerankModelName(): string {
+    return this.rerankModelUri;
+  }
+
  /**
   * Reset the inactivity timer. Called after each model operation.
   * When timer fires, models are unloaded to free memory (if no active sessions).
@ -760,9 +796,10 @@ export class LlamaCpp implements LLM {
        }
      }

-      if (llama.gpu === false) {
+      if (llama.gpu === false && !noGpuAccelerationWarningShown) {
+        noGpuAccelerationWarningShown = true;
        process.stderr.write(
-          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
+          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'QMD_STATUS_DEVICE_PROBE=1 qmd status' for device details.\n"
        );
      }
      this.llama = llama;
--- a/src/store.ts
+++ b/src/store.ts
@ -25,6 +25,9 @@ import {
  formatQueryForEmbedding,
  formatDocForEmbedding,
  withLLMSessionForLlm,
+  DEFAULT_EMBED_MODEL_URI,
+  DEFAULT_RERANK_MODEL_URI,
+  DEFAULT_GENERATE_MODEL_URI,
  type RerankDocument,
  type ILLMSession,
 } from "./llm.js";
@ -39,9 +42,9 @@ import type {
 // Configuration
 // =============================================================================

-export const DEFAULT_EMBED_MODEL = "embeddinggemma";
-export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
-export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
+export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
+export const DEFAULT_RERANK_MODEL = DEFAULT_RERANK_MODEL_URI;
+export const DEFAULT_QUERY_MODEL = DEFAULT_GENERATE_MODEL_URI;
 export const DEFAULT_GLOB = "**/*.md";
 export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
 export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
@ -1749,8 +1752,8 @@ export function createStore(dbPath?: string): Store {
    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),

    // Query expansion & reranking
-    expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
-    rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model, db, intent, store.llm),
+    expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model ?? store.llm?.generateModelName ?? DEFAULT_QUERY_MODEL, db, intent, store.llm),
+    rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model ?? store.llm?.rerankModelName ?? DEFAULT_RERANK_MODEL, db, intent, store.llm),

    // Document retrieval
    findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options),
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@ -27,14 +27,15 @@ let testCounter = 0; // Unique counter for each test run
 const thisDir = dirname(fileURLToPath(import.meta.url));
 const projectRoot = join(thisDir, "..");
 const qmdScript = join(projectRoot, "src", "cli", "qmd.ts");
-// Resolve tsx binary from project's node_modules (not cwd-dependent)
-const tsxBin = (() => {
-  const candidate = join(projectRoot, "node_modules", ".bin", "tsx");
-  if (existsSync(candidate)) {
-    return candidate;
-  }
-  return join(process.cwd(), "node_modules", ".bin", "tsx");
-})();
+const isBunRuntime = typeof (globalThis as { Bun?: unknown }).Bun !== "undefined";
+const tsxCli = join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs");
+const qmdCommand = isBunRuntime
+  ? { command: process.execPath, args: [qmdScript] }
+  : { command: process.execPath, args: [tsxCli, qmdScript] };
+
+function qmdRunnerArgs(args: string[]): { command: string; args: string[] } {
+  return { command: qmdCommand.command, args: [...qmdCommand.args, ...args] };
+}

 // Helper to run qmd command with test database
 async function runQmd(
@ -44,7 +45,8 @@ async function runQmd(
  const workingDir = options.cwd || fixturesDir;
  const dbPath = options.dbPath || testDbPath;
  const configDir = options.configDir || testConfigDir;
-  const proc = spawn(tsxBin, [qmdScript, ...args], {
+  const runner = qmdRunnerArgs(args);
+  const proc = spawn(runner.command, runner.args, {
    cwd: workingDir,
    env: {
      ...process.env,
@ -252,15 +254,15 @@ describe("CLI Skills", () => {
    expect(stderr).toBe("");
    expect(exitCode).toBe(0);
    expect(stdout).toContain("qmd");
-    expect(stdout).toContain("Search markdown knowledge bases");
+    expect(stdout).toContain("Search local markdown knowledge bases");
  });

  test("gets version-matched runtime skill content", async () => {
    const { stdout, stderr, exitCode } = await runQmd(["skills", "get", "qmd"]);
    expect(stderr).toBe("");
    expect(exitCode).toBe(0);
-    expect(stdout).toContain("# QMD - Quick Markdown Search");
-    expect(stdout).toContain("## MCP: `query`");
+    expect(stdout).toContain("# QMD - Query Markdown Documents");
+    expect(stdout).toContain("## MCP Tool: `query`");
    expect(stdout).not.toContain("This file is a discovery stub");
  });

@ -268,7 +270,7 @@ describe("CLI Skills", () => {
    const { stdout, stderr, exitCode } = await runQmd(["skills", "get", "qmd", "--full"]);
    expect(stderr).toBe("");
    expect(exitCode).toBe(0);
-    expect(stdout).toContain("# QMD - Quick Markdown Search");
+    expect(stdout).toContain("# QMD - Query Markdown Documents");
    expect(stdout).toContain("--- references/mcp-setup.md ---");
    expect(stdout).toContain("# QMD MCP Server Setup");
  });
@ -284,8 +286,8 @@ describe("CLI Skills", () => {
    const { stdout, stderr, exitCode } = await runQmd(["skill", "show"]);
    expect(stderr).toBe("");
    expect(exitCode).toBe(0);
-    expect(stdout).toContain("# QMD - Quick Markdown Search");
-    expect(stdout).toContain("## MCP: `query`");
+    expect(stdout).toContain("# QMD - Query Markdown Documents");
+    expect(stdout).toContain("## MCP Tool: `query`");
    expect(stdout).not.toContain("This file is a discovery stub");
  });

@ -300,8 +302,8 @@ describe("CLI Skills", () => {

    const installedSkillDir = join(installDir, ".agents", "skills", "qmd");
    const installed = readFileSync(join(installedSkillDir, "SKILL.md"), "utf8");
-    expect(installed).toContain("# QMD - Quick Markdown Search");
-    expect(installed).toContain("## MCP: `query`");
+    expect(installed).toContain("# QMD - Query Markdown Documents");
+    expect(installed).toContain("## MCP Tool: `query`");
    expect(installed).not.toContain("This file is a discovery stub");
    expect(readFileSync(join(installedSkillDir, "references", "mcp-setup.md"), "utf8")).toContain("# QMD MCP Server Setup");
  });
@ -370,7 +372,7 @@ describe("CLI Skill Commands", () => {
    expect(exitCode).toBe(0);

    const skillDir = join(projectDir, ".agents", "skills", "qmd");
-    expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
+    expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
    expect(existsSync(join(projectDir, ".claude", "skills", "qmd"))).toBe(false);
    expect(stdout).toContain(`✓ Installed QMD skill to ${skillDir}`);
    expect(stdout).toContain("Tip: create a Claude symlink manually");
@ -388,9 +390,9 @@ describe("CLI Skill Commands", () => {
    const skillDir = join(fakeHome, ".agents", "skills", "qmd");
    const claudeLink = join(fakeHome, ".claude", "skills", "qmd");

-    expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
+    expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
    expect(lstatSync(claudeLink).isSymbolicLink()).toBe(true);
-    expect(readFileSync(join(claudeLink, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
+    expect(readFileSync(join(claudeLink, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
    expect(stdout).toContain(`✓ Installed QMD skill to ${skillDir}`);
    expect(stdout).toContain(`✓ Linked Claude skill at ${claudeLink}`);
  });
@ -408,7 +410,7 @@ describe("CLI Skill Commands", () => {

    const skillDir = join(fakeHome, ".agents", "skills", "qmd");
    expect(lstatSync(skillDir).isSymbolicLink()).toBe(false);
-    expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Quick Markdown Search");
+    expect(readFileSync(join(skillDir, "SKILL.md"), "utf-8")).toContain("# QMD - Query Markdown Documents");
    expect(stdout).toContain(`✓ Claude already sees the skill via ${join(fakeHome, ".claude", "skills")}`);
  });

@ -470,10 +472,13 @@ describe("CLI Status Command", () => {
    expect(stdout).toContain("Collection");
  });

-  test("skips device probing by default", async () => {
+  test("shows device mode without native probing by default", async () => {
    const { stdout, exitCode } = await runQmd(["status"]);
    expect(exitCode).toBe(0);
-    expect(stdout).not.toContain("Device");
+    expect(stdout).toContain("Device");
+    expect(stdout).toContain("Mode:");
+    expect(stdout).toContain("not probed");
+    expect(stdout).toContain("QMD_STATUS_DEVICE_PROBE=1");
  });
 });

@ -1577,7 +1582,8 @@ describe("mcp http daemon", () => {
    port: number,
    options: { args?: string[]; env?: Record<string, string> } = {},
  ): import("child_process").ChildProcess {
-    const proc = spawn(tsxBin, [qmdScript, ...(options.args ?? []), "mcp", "--http", "--port", String(port)], {
+    const runner = qmdRunnerArgs([...(options.args ?? []), "mcp", "--http", "--port", String(port)]);
+    const proc = spawn(runner.command, runner.args, {
      cwd: fixturesDir,
      env: {
        ...process.env,
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@ -17,6 +17,10 @@ import {
  withNativeStdoutRedirectedToStderr,
  resolveParallelismOverride,
  resolveSafeParallelism,
+  resolveEmbedModel,
+  resolveGenerateModel,
+  resolveRerankModel,
+  resolveModels,
  withLLMSession,
  canUnloadLLM,
  SessionReleasedError,
@ -24,6 +28,63 @@ import {
  type ILLMSession,
 } from "../src/llm.js";

+describe("model name resolution", () => { // Verifies that explicit model hints win over the QMD_*_MODEL environment variables.
+  function withModelEnv(env: Record<string, string | undefined>, fn: () => void): void { // Runs fn with env applied to process.env, then restores the three QMD_*_MODEL variables.
+    const previous = { // Snapshot taken before any override so the finally block can put the values back.
+      QMD_EMBED_MODEL: process.env.QMD_EMBED_MODEL,
+      QMD_GENERATE_MODEL: process.env.QMD_GENERATE_MODEL,
+      QMD_RERANK_MODEL: process.env.QMD_RERANK_MODEL,
+    };
+    try {
+      for (const [key, value] of Object.entries(env)) {
+        if (value === undefined) delete process.env[key]; // An undefined value means "unset this variable".
+        else process.env[key] = value;
+      }
+      fn();
+    } finally { // Runs even when fn throws; only the three snapshotted keys are restored, so any other key passed in env would persist.
+      for (const [key, value] of Object.entries(previous)) {
+        if (value === undefined) delete process.env[key];
+        else process.env[key] = value;
+      }
+    }
+  }
+
+  test("all model roles resolve config hints before env fallbacks", () => {
+    withModelEnv({ // Env values deliberately differ from the config values so an env-first resolver would fail below.
+      QMD_EMBED_MODEL: "env-embed",
+      QMD_GENERATE_MODEL: "env-generate",
+      QMD_RERANK_MODEL: "env-rerank",
+    }, () => {
+      const config = {
+        embed: "config-embed",
+        generate: "config-generate",
+        rerank: "config-rerank",
+      };
+      expect(resolveEmbedModel(config)).toBe("config-embed");
+      expect(resolveGenerateModel(config)).toBe("config-generate");
+      expect(resolveRerankModel(config)).toBe("config-rerank");
+      expect(resolveModels(config)).toEqual(config); // The combined resolver is expected to return the same embed/generate/rerank shape it was given.
+    });
+  });
+
+  test("LlamaCpp constructor uses the same resolver as status/embed/query helpers", () => {
+    withModelEnv({
+      QMD_EMBED_MODEL: "env-embed",
+      QMD_GENERATE_MODEL: "env-generate",
+      QMD_RERANK_MODEL: "env-rerank",
+    }, () => {
+      const llm = new LlamaCpp({ // Constructor options are named embedModel/generateModel/rerankModel; the resolver helpers take embed/generate/rerank.
+        embedModel: "config-embed",
+        generateModel: "config-generate",
+        rerankModel: "config-rerank",
+      });
+      expect(llm.embedModelName).toBe(resolveEmbedModel({ embed: "config-embed" })); // Compared against the helper's output rather than a literal, so both paths must agree.
+      expect(llm.generateModelName).toBe(resolveGenerateModel({ generate: "config-generate" }));
+      expect(llm.rerankModelName).toBe(resolveRerankModel({ rerank: "config-rerank" }));
+    });
+  });
+});
+
 // =============================================================================
 // Singleton Tests (no model loading required)
 // =============================================================================
@ -178,6 +239,40 @@ describe("native llama stdout containment", () => {
      else process.env.QMD_FORCE_CPU = prevForceCpu;
    }
  });
+
+  test("warns about CPU fallback only once per process", async () => { // Two LlamaCpp instances must together produce a single "no GPU acceleration" warning on stderr.
+    const prevGpu = process.env.QMD_LLAMA_GPU; // Saved so the finally block can restore the caller's environment.
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_LLAMA_GPU = "false"; // presumably selects the no-GPU path in src/llm — TODO confirm
+    delete process.env.QMD_FORCE_CPU;
+
+    setNodeLlamaCppModuleForTest({ // Stub module whose getLlama resolves to { gpu: false, cpuMathCores: 4 }.
+      LlamaLogLevel: { error: "error" },
+      resolveModelFile: vi.fn(),
+      LlamaChatSession: vi.fn() as any,
+      getLlama: vi.fn(async () => ({ gpu: false, cpuMathCores: 4 }) as any),
+    });
+
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); // Capture stderr writes instead of printing them.
+    try {
+      const first = new LlamaCpp();
+      const second = new LlamaCpp();
+
+      await (first as any).ensureLlama(); // Reached through an any cast — presumably not part of the public type; verify.
+      await (second as any).ensureLlama();
+
+      const stderr = String(stderrSpy.mock.calls.map(call => call[0]).join("")); // Concatenate the first argument of every captured write.
+      expect(stderr.match(/no GPU acceleration/g)?.length).toBe(1); // NOTE(review): exactly-one assumes the warning has not already fired earlier in this process — confirm test isolation.
+      expect(stderr).toContain("QMD_STATUS_DEVICE_PROBE=1 qmd status");
+    } finally { // Undo the spy, the stubbed module and both env vars whether or not the assertions passed.
+      stderrSpy.mockRestore();
+      setNodeLlamaCppModuleForTest(null);
+      if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU;
+      else process.env.QMD_LLAMA_GPU = prevGpu;
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
 });

 describe("LLM context parallelism safety", () => {
--- a/test/local-config.test.ts
+++ b/test/local-config.test.ts
@ -5,6 +5,17 @@ import { tmpdir } from "node:os";
 import { afterEach, describe, expect, test } from "vitest";
 import { findLocalConfigPath, getLocalDbPath } from "../src/collections.js";

+function cliCommandArgs(command: string): { bin: string; args: string[] } { // Builds the executable and argv for running the qmd CLI from source with one subcommand.
+  const cliPath = join(process.cwd(), "src/cli/qmd.ts"); // Resolved against the current working directory, not this test file's location.
+  if (process.versions.bun) { // Under Bun the current executable is handed the .ts entry point directly.
+    return { bin: process.execPath, args: [cliPath, command] };
+  }
+  return { // Otherwise the current executable runs the entry point through the locally installed tsx CLI script.
+    bin: process.execPath,
+    args: [join(process.cwd(), "node_modules/tsx/dist/cli.mjs"), cliPath, command], // NOTE(review): relies on tsx's dist/cli.mjs path — confirm it is stable across tsx versions.
+  };
+}
+
 const roots: string[] = [];

 function tempProject(): string {
@ -56,12 +67,11 @@ describe("local .qmd project config", () => {
    mkdirSync(join(root, ".qmd"), { recursive: true });
    mkdirSync(join(root, "docs"), { recursive: true });
    writeFileSync(join(root, "docs", "a.md"), "# A\n\nLocal test document.\n");
-    writeFileSync(join(root, ".qmd", "index.yaml"), `collections:\n  docs:\n    path: ${JSON.stringify(join(root, "docs"))}\n    pattern: "**/*.md"\n    context:\n      /: Local test docs\n`);
+    writeFileSync(join(root, ".qmd", "index.yaml"), `collections:\n  docs:\n    path: ${JSON.stringify(join(root, "docs"))}\n    pattern: "**/*.md"\n    context:\n      /: Local test docs\nmodels:\n  embed: local-embed-model\n  rerank: local-rerank-model\n  generate: local-generate-model\n`);

    const home = join(root, "home");
-    const tsxBin = join(process.cwd(), "node_modules", ".bin", "tsx");
-    const runner = existsSync(tsxBin) ? tsxBin : "bun";
-    const output = execFileSync(runner, [join(process.cwd(), "src/cli/qmd.ts"), "status"], {
+    const { bin, args } = cliCommandArgs("status");
+    const output = execFileSync(bin, args, {
      cwd: root,
      encoding: "utf-8",
      env: {
@ -69,12 +79,19 @@ describe("local .qmd project config", () => {
        HOME: home,
        XDG_CONFIG_HOME: join(home, ".config"),
        XDG_CACHE_HOME: join(home, ".cache"),
+        QMD_EMBED_MODEL: "env-embed-model",
+        QMD_RERANK_MODEL: "env-rerank-model",
+        QMD_GENERATE_MODEL: "env-generate-model",
      },
    });

    const localIndex = join(root, ".qmd", "index.sqlite");
    expect(output).toContain(`Index: ${realpathSync(localIndex)}`);
    expect(output).toContain("docs (qmd://docs/)");
+    expect(output).toContain("Embedding:   local-embed-model");
+    expect(output).toContain("Reranking:   local-rerank-model");
+    expect(output).toContain("Generation:  local-generate-model");
+    expect(output).not.toContain("env-embed-model");
    expect(existsSync(localIndex)).toBe(true);
    expect(existsSync(join(home, ".cache", "qmd", "index.sqlite"))).toBe(false);
  });
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@ -186,7 +186,7 @@ function seedTestData(db: Database): void {
  for (let i = 0; i < 768; i++) embedding[i] = Math.random();

  for (const doc of docs.slice(0, 4)) { // Skip large file for embeddings
-    db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`).run(doc.hash, now);
+    db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, now);
    db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${doc.hash}_0`, embedding);
  }
 }
--- a/test/package.test.ts
+++ b/test/package.test.ts
@ -20,8 +20,8 @@ describe("package grammar distribution", () => {
    expect(pkg.files, "published package files").toContain("scripts/check-package-grammars.mjs");
    expect(pkg.files, "published package files").toContain("skills/");
    const qmdSkill = readFileSync(new URL("skills/qmd/SKILL.md", root), "utf8");
-    expect(qmdSkill).toContain("# QMD - Quick Markdown Search");
-    expect(qmdSkill).toContain("## MCP: `query`");
+    expect(qmdSkill).toContain("# QMD - Query Markdown Documents");
+    expect(qmdSkill).toContain("## MCP Tool: `query`");
    expect(qmdSkill).not.toContain("This file is a discovery stub");

    const scriptPath = join(root.pathname, "scripts", "check-package-grammars.mjs");