/**
 * Parse the structured search syntax accepted by `qmd query`.
 *
 * The query is split on newlines; every line is trimmed and blank lines are
 * ignored. A line starting with `lex:`, `vec:` or `hyde:` (case-insensitive)
 * becomes a sub-search of that type with the prefix stripped.
 *
 * Outcomes:
 *   - no non-blank lines, or exactly one unprefixed line and no prefixes at all
 *     -> `null` (plain query; the caller falls back to automatic expansion)
 *   - prefixed lines with text -> the sub-searches, in input order
 *   - prefixed lines plus exactly ONE unprefixed line -> the unprefixed line is
 *     placed first as a `lex` sub-search (it is not expanded)
 *   - a prefixed line without text is ignored as long as at least one other
 *     prefixed line carries text
 *
 * @param query Raw query string; may span several lines.
 * @returns The sub-searches to run, or `null` for a plain query.
 * @throws Error when more than one line lacks a prefix (ambiguous), or when
 *   prefixes are present but none of them is followed by any query text.
 *
 * Examples:
 *   "CAP theorem"                 -> null (plain query, use expansion)
 *   "lex: CAP theorem"            -> [{ type: 'lex', query: 'CAP theorem' }]
 *   "lex: CAP\nvec: consistency"  -> [{ type: 'lex', ... }, { type: 'vec', ... }]
 *   "vec: consistency\nCAP"       -> [{ type: 'lex', query: 'CAP' }, { type: 'vec', ... }]
 *   "CAP\nconsistency"            -> throws (multiple plain lines)
 *   "lex:"                        -> throws (prefix without query text)
 */
function parseStructuredQuery(query: string): StructuredSubSearch[] | null {
  const lines = query.split('\n').map(l => l.trim()).filter(l => l.length > 0);
  if (lines.length === 0) return null;

  const prefixRe = /^(lex|vec|hyde):\s*/i;
  const searches: StructuredSubSearch[] = [];
  const plainLines: string[] = [];
  // Remembers that structured syntax was attempted even if every prefixed
  // line turned out to be empty.
  let sawPrefix = false;

  for (const line of lines) {
    const match = prefixRe.exec(line);
    if (match === null) {
      plainLines.push(line);
      continue;
    }
    sawPrefix = true;
    // Group 1 always participates when the pattern matches.
    const type = match[1]!.toLowerCase() as 'lex' | 'vec' | 'hyde';
    // `line` is already trimmed and the pattern consumed the whitespace after
    // the colon, so the remainder needs no further trimming.
    const text = line.slice(match[0].length);
    if (text.length > 0) {
      searches.push({ type, query: text });
    }
  }

  // Multiple plain lines without prefixes -> ambiguous, error
  if (plainLines.length > 1) {
    throw new Error(
      `Ambiguous query: multiple lines without lex:/vec:/hyde: prefix.\n` +
      `Either use a single line (for query expansion) or prefix each line.\n` +
      `Example:\n lex: keyword terms\n vec: natural language question\n hyde: hypothetical answer passage`
    );
  }

  // Prefixes were used but none carries text. Returning null here would make
  // the caller run the raw string (e.g. "lex:") through automatic expansion.
  if (sawPrefix && searches.length === 0) {
    throw new Error(
      `Empty structured query: lex:/vec:/hyde: prefix given without any query text.\n` +
      `Example:\n lex: keyword terms\n vec: natural language question`
    );
  }

  const [plain] = plainLines;

  // Only prefixed lines: guaranteed non-empty by the check above.
  if (plain === undefined) return searches;

  // Single plain line, no prefixes -> null (use normal expansion)
  if (!sawPrefix) return null;

  // Mix of prefixed lines and one plain line -> treat plain as lex, first
  searches.unshift({ type: 'lex', query: plain });
  return searches;
}
500 : (opts.limit || 10), - minScore: opts.minScore || 0, - hooks: { - onStrongSignal: (score) => { - process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`); + let results; + + if (structuredQueries) { + // Structured search — user provided their own query expansions + const typeLabels = structuredQueries.map(s => s.type).join('+'); + process.stderr.write(`${c.dim}Structured search: ${structuredQueries.length} queries (${typeLabels})${c.reset}\n`); + + // Log each sub-query + for (const s of structuredQueries) { + let preview = s.query.replace(/\n/g, ' '); + if (preview.length > 72) preview = preview.substring(0, 69) + '...'; + process.stderr.write(`${c.dim}├─ ${s.type}: ${preview}${c.reset}\n`); + } + process.stderr.write(`${c.dim}└─ Searching...${c.reset}\n`); + + results = await structuredSearch(store, structuredQueries, { + collection: singleCollection, + limit: opts.all ? 500 : (opts.limit || 10), + minScore: opts.minScore || 0, + hooks: { + onRerankStart: (chunkCount) => { + process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`); + progress.indeterminate(); + }, + onRerankDone: () => { + progress.clear(); + }, }, - onExpand: (original, expanded) => { - logExpansionTree(original, expanded); - process.stderr.write(`${c.dim}Searching ${expanded.length + 1} queries...${c.reset}\n`); + }); + } else { + // Standard hybrid query with automatic expansion + results = await hybridQuery(store, query, { + collection: singleCollection, + limit: opts.all ? 
500 : (opts.limit || 10), + minScore: opts.minScore || 0, + hooks: { + onStrongSignal: (score) => { + process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`); + }, + onExpand: (original, expanded) => { + logExpansionTree(original, expanded); + process.stderr.write(`${c.dim}Searching ${expanded.length + 1} queries...${c.reset}\n`); + }, + onRerankStart: (chunkCount) => { + process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`); + progress.indeterminate(); + }, + onRerankDone: () => { + progress.clear(); + }, }, - onRerankStart: (chunkCount) => { - process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`); - progress.indeterminate(); - }, - onRerankDone: () => { - progress.clear(); - }, - }, - }); + }); + } // Post-filter for multi-collection if (collectionNames.length > 1) { @@ -2097,6 +2192,11 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri return; } + // Use first lex/vec query for output context, or original query + const displayQuery = structuredQueries + ? 
(structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query) + : query; + // Map to CLI output format — use bestChunk for snippet display outputResults(results.map(r => ({ file: r.file, @@ -2107,7 +2207,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri score: r.score, context: r.context, docid: r.docid, - })), query, { ...opts, limit: results.length }); + })), displayQuery, { ...opts, limit: results.length }); }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' }); } @@ -2214,6 +2314,7 @@ function showHelp(): void { console.log(" qmd embed [-f] - Create vector embeddings (900 tokens/chunk, 15% overlap)"); console.log(" qmd cleanup - Remove cache and orphaned data, vacuum DB"); console.log(" qmd query - Search with query expansion + reranking (recommended)"); + console.log(" qmd query 'lex:..\\nvec:...' - Structured search (you provide lex/vec/hyde queries)"); console.log(" qmd search - Full-text keyword search (BM25, no LLM)"); console.log(" qmd vsearch - Vector similarity search (no reranking)"); console.log(" qmd mcp - Start MCP server (stdio transport)"); @@ -2237,6 +2338,13 @@ function showHelp(): void { console.log(" --xml - XML output"); console.log(" -c, --collection - Filter results to a specific collection"); console.log(""); + console.log("Structured queries (qmd query):"); + console.log(" Prefix lines with lex:, vec:, or hyde: to skip automatic expansion."); + console.log(" lex: BM25 keyword search (exact terms)"); + console.log(" vec: Vector similarity (natural language question)"); + console.log(" hyde: Vector similarity (hypothetical answer passage)"); + console.log(" Example: qmd query $'lex: CAP theorem\\nvec: consistency vs availability tradeoff'"); + console.log(""); console.log("Multi-get options:"); console.log(" -l - Maximum lines per file"); console.log(" --max-bytes - Skip files larger than N bytes (default: 10240)");