feat: add collections array filter + improve query writing docs

- structured_search now accepts collections[] for OR filtering
- Updated skill docs with detailed query writing guidance
- lex: 2-5 keywords, include synonyms, exact names
- vec: full natural language questions with context
- hyde: 50-100 word hypothetical answer passages
This commit is contained in:
Tobi Lütke 2026-02-18 22:09:24 -05:00
parent 6d6bdff09c
commit d1ec31eab8
No known key found for this signature in database
3 changed files with 93 additions and 54 deletions

View File

@ -11,54 +11,87 @@ allowed-tools: Bash(qmd:*), mcp__qmd__*
# QMD - Quick Markdown Search
Local search engine for markdown content. Indexes notes, docs, and knowledge bases.
Local search engine for markdown content.
## Status
!`qmd status 2>/dev/null || echo "Not installed: npm install -g @tobilu/qmd"`
## MCP Search — `structured_search`
Pass 1-4 sub-queries with type `lex`, `vec`, or `hyde`:
## MCP: `structured_search`
```json
{
"searches": [
{ "type": "lex", "query": "CAP theorem consistency" },
{ "type": "vec", "query": "tradeoff between consistency and availability" }
]
],
"collections": ["notes", "docs"],
"limit": 10
}
```
| Type | Method | What to Write |
|------|--------|---------------|
| `lex` | BM25 keywords | Short phrases — exact terms, names, code |
| `vec` | Vector search | Natural language question |
| `hyde` | Vector search | Hypothetical answer (50-100 words) |
### Search Types
**Tips:**
- Quick lookup → single `lex` query
- Don't know exact terms → use `vec`
- Best results → combine `lex` + `vec` (+ `hyde` for complex topics)
- First query gets 2x weight
| Type | Method | Input |
|------|--------|-------|
| `lex` | BM25 | Keywords — exact terms, names, code |
| `vec` | Vector | Question — natural language |
| `hyde` | Vector | Answer — hypothetical result (50-100 words) |
## MCP Tools
### Writing Good Queries
**lex (keyword)**
- 2-5 terms, no filler words
- Include synonyms: `"auth authentication login"`
- Use exact names: `"PostgreSQL connection pool"`
- Code identifiers work: `"handleError async"`
**vec (semantic)**
- Full natural language question
- Be specific: `"how does the rate limiter handle burst traffic"` not `"rate limiting"`
- Include context: `"in the payment service, how are refunds processed"`
**hyde (hypothetical document)**
- Write 50-100 words of what the *answer* looks like
- Use the vocabulary you expect in the result
- Example (shortened here; aim for 50-100 words in practice): `"The rate limiter uses a sliding window algorithm with a 60-second window. When a client exceeds 100 requests per minute, subsequent requests return 429 Too Many Requests until the window resets."`
### Combining Types
| Goal | Approach |
|------|----------|
| Know exact terms | `lex` only |
| Don't know vocabulary | `vec` only |
| Best recall | `lex` + `vec` |
| Complex topic | `lex` + `vec` + `hyde` |
First query gets 2x weight in fusion — put your best guess first.
### Collection Filtering
```jsonc
{ "collection": "docs" } // Single collection
{ "collections": ["docs", "notes"] } // Multiple (OR)
```
Omit both to search all collections. If both are provided, `collections` takes precedence and `collection` is ignored.
## Other MCP Tools
| Tool | Use |
|------|-----|
| `structured_search` | Search with lex/vec/hyde queries |
| `get` | Retrieve doc by path or `#docid` |
| `multi_get` | Retrieve multiple docs by glob/list |
| `status` | Index health and collections |
| `multi_get` | Retrieve multiple by glob/list |
| `status` | Collections and health |
## CLI
```bash
qmd search "keywords" # BM25 keyword search
qmd vsearch "question" # Vector similarity
qmd query "question" # Auto-expand + rerank
qmd query $'lex: X\nvec: Y' # Structured (same as MCP)
qmd get "#abc123" # Retrieve by docid
qmd query "question" # Auto-expand + rerank
qmd query $'lex: X\nvec: Y' # Structured
qmd search "keywords" # BM25 only
qmd vsearch "question" # Vector only
qmd get "#abc123" # By docid
```
## Setup
@ -66,10 +99,5 @@ qmd get "#abc123" # Retrieve by docid
```bash
npm install -g @tobilu/qmd
qmd collection add ~/notes --name notes
qmd embed # Generate embeddings
```
MCP config for Claude Code (`~/.claude/settings.json`):
```json
{ "mcpServers": { "qmd": { "command": "qmd", "args": ["mcp"] } } }
qmd embed
```

View File

@ -261,11 +261,11 @@ function createMcpServer(store: Store): McpServer {
),
limit: z.number().optional().default(10).describe("Maximum number of results (default: 10)"),
minScore: z.number().optional().default(0).describe("Minimum relevance score 0-1 (default: 0)"),
collection: z.string().optional().describe("Filter to a specific collection by name"),
intent: z.string().optional().describe("(Future) Domain intent hint, e.g., 'distributed systems', 'startup finances'"),
collection: z.string().optional().describe("Filter to a single collection by name"),
collections: z.array(z.string()).optional().describe("Filter to multiple collections (OR match)"),
},
},
async ({ searches, limit, minScore, collection, intent }) => {
async ({ searches, limit, minScore, collection, collections }) => {
// Map to internal format
const subSearches: StructuredSubSearch[] = searches.map(s => ({
type: s.type,
@ -274,9 +274,9 @@ function createMcpServer(store: Store): McpServer {
const results = await structuredSearch(store, subSearches, {
collection,
collections,
limit,
minScore,
intent,
});
// Use first lex or vec query for snippet extraction
@ -582,9 +582,9 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
const results = await structuredSearch(store, subSearches, {
collection: params.collection,
collections: params.collections,
limit: params.limit ?? 10,
minScore: params.minScore ?? 0,
intent: params.intent,
});
// Use first lex or vec query for snippet extraction

View File

@ -3072,7 +3072,8 @@ export interface StructuredSubSearch {
}
export interface StructuredSearchOptions {
collection?: string;
collection?: string; // Single collection filter
collections?: string[]; // Multiple collections filter (OR)
limit?: number; // default 10
minScore?: number; // default 0
candidateLimit?: number; // default RERANK_CANDIDATE_LIMIT
@ -3107,9 +3108,12 @@ export async function structuredSearch(
const limit = options?.limit ?? 10;
const minScore = options?.minScore ?? 0;
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
const collection = options?.collection;
const hooks = options?.hooks;
// Normalize collection filter to array (undefined = all collections)
const collections: string[] | undefined = options?.collections
?? (options?.collection ? [options.collection] : undefined);
if (searches.length === 0) return [];
const rankedLists: RankedResult[][] = [];
@ -3118,16 +3122,21 @@ export async function structuredSearch(
`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
).get();
// List of collection filters to iterate over when running each search (or all if undefined)
const collectionList = collections ?? [undefined]; // undefined = all collections
// Step 1: Run FTS for all lex searches (sync, instant)
for (const search of searches) {
if (search.type === 'lex') {
const ftsResults = store.searchFTS(search.query, 20, collection);
if (ftsResults.length > 0) {
for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(ftsResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
for (const coll of collectionList) {
const ftsResults = store.searchFTS(search.query, 20, coll);
if (ftsResults.length > 0) {
for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(ftsResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
}
}
}
}
@ -3144,16 +3153,18 @@ export async function structuredSearch(
const embedding = embeddings[i]?.embedding;
if (!embedding) continue;
const vecResults = await store.searchVec(
vecSearches[i]!.query, DEFAULT_EMBED_MODEL, 20, collection,
undefined, embedding
);
if (vecResults.length > 0) {
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(vecResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
for (const coll of collectionList) {
const vecResults = await store.searchVec(
vecSearches[i]!.query, DEFAULT_EMBED_MODEL, 20, coll,
undefined, embedding
);
if (vecResults.length > 0) {
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(vecResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
}
}
}
}