Merge pull request #502 from JohnRichardEnders/feat/yaml-model-config

feat: support model configuration in index.yml
2026-04-05 18:02:29 -04:00 · 2026-04-05 18:02:29 -04:00 · 54fc7b01a9
commit 54fc7b01a9
parent c940ce19d0 4f11517fb4
8 changed files with 100 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -18,6 +18,10 @@
  Measures precision@k, recall, MRR, and F1 across BM25, vector, hybrid,
  and full pipeline backends. Ships with an example fixture against
  the eval-docs test collection.
+- `models:` section in `index.yml` lets you configure `embed`, `rerank`,
+  and `generate` model URIs per config. Resolution order is
+  config > env var (`QMD_EMBED_MODEL`, `QMD_RERANK_MODEL`,
+  `QMD_GENERATE_MODEL`) > built-in default.
 - CLI search output now emits clickable OSC 8 terminal hyperlinks when
  stdout is a TTY. Links resolve `qmd://` paths to absolute filesystem
  paths and open in editors via URI templates (default:
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -77,7 +77,7 @@ import {
  type ReindexResult,
  type ChunkStrategy,
 } from "../store.js";
-import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js";
 import {
  formatSearchResults,
  formatDocuments,
@ -118,6 +118,13 @@ function getStore(): ReturnType<typeof createStore> {
    try {
      const config = loadConfig();
      syncConfigToDb(store.db, config);
+      if (config.models) {
+        setDefaultLlamaCpp(new LlamaCpp({
+          embedModel: config.models.embed,
+          generateModel: config.models.generate,
+          rerankModel: config.models.rerank,
+        }));
+      }
    } catch {
      // Config may not exist yet — that's fine, DB works without it
    }
--- a/src/collections.ts
+++ b/src/collections.ts
@ -33,6 +33,15 @@ export interface Collection {
  includeByDefault?: boolean; // Include in queries by default (default: true)
 }

+/**
+ * Model configuration for embedding, reranking, and generation.
+ *
+ * Each field is an optional model URI. When a field is omitted, the LlamaCpp
+ * constructor falls back to the matching QMD_*_MODEL env var and then to the
+ * built-in default.
+ */
+export interface ModelsConfig {
+  embed?: string;     // Embedding model URI
+  rerank?: string;    // Reranking model URI
+  generate?: string;  // Generation model URI
+}
+
 /**
 * The complete configuration file structure
 */
@ -41,6 +50,7 @@ export interface CollectionConfig {
  editor_uri?: string;                        // Editor URI template for terminal hyperlinks
  editor_uri_template?: string;               // Alias for editor_uri
  collections: Record<string, Collection>;    // Collection name -> config
+  models?: ModelsConfig;
 }

 /**
--- a/src/index.ts
+++ b/src/index.ts
@ -351,21 +351,26 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
  const hasYamlConfig = !!options.configPath;

  // Sync config into SQLite store_collections
+  let config: CollectionConfig | undefined;
  if (options.configPath) {
    // YAML mode: inject config source for write-through, sync to DB
    setConfigSource({ configPath: options.configPath });
-    const config = loadConfig();
+    config = loadConfig();
    syncConfigToDb(db, config);
  } else if (options.config) {
    // Inline config mode: inject config source for mutations, sync to DB
    setConfigSource({ config: options.config });
-    syncConfigToDb(db, options.config);
+    config = options.config;
+    syncConfigToDb(db, config);
  }
  // else: DB-only mode — no external config, use existing store_collections

  // Create a per-store LlamaCpp instance — lazy-loads models on first use,
  // auto-unloads after 5 min inactivity to free VRAM.
  const llm = new LlamaCpp({
+    embedModel: config?.models?.embed,
+    generateModel: config?.models?.generate,
+    rerankModel: config?.models?.rerank,
    inactivityTimeoutMs: 5 * 60 * 1000,
    disposeModelsOnInactivity: true,
  });
--- a/src/llm.ts
+++ b/src/llm.ts
@ -193,7 +193,7 @@ export type RerankDocument = {
 // HuggingFace model URIs for node-llama-cpp
 // Format: hf:<user>/<repo>/<file>
 // Override via the QMD_EMBED_MODEL, QMD_RERANK_MODEL, or QMD_GENERATE_MODEL env vars (e.g. QMD_EMBED_MODEL=hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
-const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -436,15 +436,19 @@ export class LlamaCpp implements LLM {


  constructor(config: LlamaCppConfig = {}) {
-    this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
-    this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
-    this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
+    // Model URI resolution order: explicit config > QMD_*_MODEL env var > built-in default.
+    // `||` (not `??`) is used, so an empty-string config value or env var also falls through.
+    this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
+    this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
+    this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
    this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
    this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
    this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
    this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
  }

+  /** URI of the embedding model this instance resolved in its constructor. */
+  get embedModelName(): string {
+    return this.embedModelUri;
+  }
+
  /**
   * Reset the inactivity timer. Called after each model operation.
   * When timer fires, models are unloaded to free memory (if no active sessions).
@ -1559,8 +1563,7 @@ let defaultLlamaCpp: LlamaCpp | null = null;
 */
 export function getDefaultLlamaCpp(): LlamaCpp {
  if (!defaultLlamaCpp) {
-    const embedModel = process.env.QMD_EMBED_MODEL;
-    defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
+    // Lazily create the singleton with no explicit config; the LlamaCpp
+    // constructor itself falls back to the QMD_*_MODEL env vars and defaults.
+    defaultLlamaCpp = new LlamaCpp();
  }
  return defaultLlamaCpp;
 }
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@ -18,6 +18,7 @@ import { WebStandardStreamableHTTPServerTransport }
  from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
 import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js";
 import { z } from "zod";
+import { existsSync } from "fs";
 import {
  createStore,
  extractSnippet,
@ -28,6 +29,7 @@ import {
  type ExpandedQuery,
  type IndexStatus,
 } from "../index.js";
+import { getConfigPath } from "../collections.js";

 // =============================================================================
 // Types for structured content
@ -536,7 +538,11 @@ Intent-aware lex (C++ performance, not sports):
 // =============================================================================

 export async function startMcpServer(): Promise<void> {
-  const store = await createStore({ dbPath: getDefaultDbPath() });
+  const configPath = getConfigPath();
+  const store = await createStore({
+    dbPath: getDefaultDbPath(),
+    ...(existsSync(configPath) ? { configPath } : {}),
+  });
  const server = await createMcpServer(store);
  const transport = new StdioServerTransport();
  await server.connect(transport);
@ -557,7 +563,11 @@ export type HttpServerHandle = {
 * Binds to localhost only. Returns a handle for shutdown and port discovery.
 */
 export async function startMcpHttpServer(port: number, options?: { quiet?: boolean }): Promise<HttpServerHandle> {
-  const store = await createStore({ dbPath: getDefaultDbPath() });
+  const configPath = getConfigPath();
+  const store = await createStore({
+    dbPath: getDefaultDbPath(),
+    ...(existsSync(configPath) ? { configPath } : {}),
+  });

  // Pre-fetch default collection names for REST endpoint
  const defaultCollectionNames = await store.getDefaultCollectionNames();
--- a/src/store.ts
+++ b/src/store.ts
@ -1414,6 +1414,7 @@ export async function generateEmbeddings(

  // Use store's LlamaCpp or global singleton, wrapped in a session
  const llm = getLlm(store);
+  const embedModelUri = llm.embedModelName;

  // Create a session manager for this llm instance
  const result = await withLLMSessionForLlm(llm, async (session) => {
@ -1471,7 +1472,7 @@ export async function generateEmbeddings(

      if (!vectorTableInitialized) {
        const firstChunk = batchChunks[0]!;
-        const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, model);
+        const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
        const firstResult = await session.embed(firstText, { model });
        if (!firstResult) {
          throw new Error("Failed to get embedding dimensions from first chunk");
@ -1503,7 +1504,7 @@ export async function generateEmbeddings(

        const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length);
        const chunkBatch = batchChunks.slice(batchStart, batchEnd);
-        const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, model));
+        const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));

        try {
          const embeddings = await session.embedBatch(texts, { model });
@ -1527,7 +1528,7 @@ export async function generateEmbeddings(
          } else {
            for (const chunk of chunkBatch) {
              try {
-                const text = formatDocForEmbedding(chunk.text, chunk.title, model);
+                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                const result = await session.embed(text, { model });
                if (result) {
                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
@ -3985,7 +3986,7 @@ export async function hybridQuery(

    // Batch embed all vector queries in a single call
    const llm = getLlm(store);
-    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
+    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
    hooks?.onEmbedStart?.(textsToEmbed.length);
    const embedStart = Date.now();
    const embeddings = await llm.embedBatch(textsToEmbed);
@ -4368,7 +4369,7 @@ export async function structuredSearch(
    );
    if (vecSearches.length > 0) {
      const llm = getLlm(store);
-      const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query));
+      const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
      hooks?.onEmbedStart?.(textsToEmbed.length);
      const embedStart = Date.now();
      const embeddings = await llm.embedBatch(textsToEmbed);
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@ -117,6 +117,50 @@ describe("LlamaCpp expand context size config", () => {
  });
 });

+describe("LlamaCpp model resolution (config > env > default)", () => {
+  const HARDCODED_EMBED = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+  const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
+  const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
+
+  // Every env var the LlamaCpp constructor consults when resolving model URIs.
+  const MODEL_ENV_VARS = ["QMD_EMBED_MODEL", "QMD_RERANK_MODEL", "QMD_GENERATE_MODEL"] as const;
+  type ModelEnvVar = (typeof MODEL_ENV_VARS)[number];
+
+  /**
+   * Run `fn` with the model env vars set exactly to `overrides` (a missing
+   * entry means "unset"), then restore the previous values even if `fn` throws.
+   * All three vars are isolated so an ambient QMD_*_MODEL in the developer's
+   * shell or CI cannot change the outcome of a test.
+   */
+  function withModelEnv(overrides: Partial<Record<ModelEnvVar, string>>, fn: () => void): void {
+    const saved = MODEL_ENV_VARS.map((name) => [name, process.env[name]] as const);
+    for (const name of MODEL_ENV_VARS) {
+      const value = overrides[name];
+      if (value === undefined) delete process.env[name];
+      else process.env[name] = value;
+    }
+    try {
+      fn();
+    } finally {
+      for (const [name, value] of saved) {
+        if (value === undefined) delete process.env[name];
+        else process.env[name] = value;
+      }
+    }
+  }
+
+  test("uses hardcoded default when no config or env is set", () => {
+    withModelEnv({}, () => {
+      const llm = new LlamaCpp({}) as any;
+      expect(llm.embedModelUri).toBe(HARDCODED_EMBED);
+      expect(llm.rerankModelUri).toBe(HARDCODED_RERANK);
+      expect(llm.generateModelUri).toBe(HARDCODED_GENERATE);
+    });
+  });
+
+  test("env var overrides hardcoded default", () => {
+    withModelEnv(
+      {
+        QMD_EMBED_MODEL: "hf:custom/embed-model.gguf",
+        QMD_RERANK_MODEL: "hf:custom/rerank-model.gguf",
+        QMD_GENERATE_MODEL: "hf:custom/generate-model.gguf",
+      },
+      () => {
+        const llm = new LlamaCpp({}) as any;
+        expect(llm.embedModelUri).toBe("hf:custom/embed-model.gguf");
+        expect(llm.rerankModelUri).toBe("hf:custom/rerank-model.gguf");
+        expect(llm.generateModelUri).toBe("hf:custom/generate-model.gguf");
+      },
+    );
+  });
+
+  test("config overrides env var", () => {
+    withModelEnv(
+      {
+        QMD_EMBED_MODEL: "hf:env/model.gguf",
+        QMD_RERANK_MODEL: "hf:env/rerank.gguf",
+        QMD_GENERATE_MODEL: "hf:env/generate.gguf",
+      },
+      () => {
+        const llm = new LlamaCpp({
+          embedModel: "hf:config/model.gguf",
+          rerankModel: "hf:config/rerank.gguf",
+          generateModel: "hf:config/generate.gguf",
+        }) as any;
+        expect(llm.embedModelUri).toBe("hf:config/model.gguf");
+        expect(llm.rerankModelUri).toBe("hf:config/rerank.gguf");
+        expect(llm.generateModelUri).toBe("hf:config/generate.gguf");
+      },
+    );
+  });
+});
+
 describe("LlamaCpp rerank deduping", () => {
  test("deduplicates identical document texts before scoring", async () => {
    const llm = new LlamaCpp({}) as any;