Add opt-in AST-aware chunk boundary detection for code files using
web-tree-sitter. When enabled with `--chunk-strategy auto`, code files
(.ts, .tsx, .js, .jsx, .py, .go, .rs) are chunked at function, class,
and import boundaries instead of arbitrary text positions. Default
behavior (`regex`) is unchanged — no surprises on upgrade.
In testing on QMD's own codebase, AST mode split 42% fewer function
bodies across chunk boundaries compared to regex-only chunking.
Usage:
qmd embed --chunk-strategy auto
qmd query "search terms" --chunk-strategy auto
What's included:
- Language detection from file extension with support for TypeScript,
JavaScript (including arrow functions and function expressions),
Python, Go, and Rust
- Per-language tree-sitter queries with scored break points aligned to
the existing markdown scale (class=100, function=90, type=80, import=60)
- AST break points merged with regex break points — highest score wins
at each position, so embedded markdown (comments, docstrings) still
benefits from regex patterns
- Refactored chunking core: chunkDocumentWithBreakPoints() extracted,
mergeBreakPoints() added, async chunkDocumentAsync() wrapper for AST
- ChunkStrategy type ("auto" | "regex") threaded through
generateEmbeddings(), hybridQuery(), structuredSearch(), CLI, and SDK
- getASTStatus() health check wired into `qmd status`
- Parse failures log a warning and fall back to regex — never crash
Hardening:
- Grammar packages are optionalDependencies with pinned versions to
prevent ABI breaks from semver drift
- web-tree-sitter is a direct dependency (pinned)
- Errors are logged (not silently swallowed) for debuggability
- Tested on both Node.js and Bun (Bun is actually faster)
Testing:
- 26 unit tests (test/ast.test.ts) — all 4 languages, error handling
- 7 integration tests (test/store.test.ts) — merge, equivalence, bypass
- Standalone test-ast-chunking.mjs with 63 synthetic tests and a
real-collection performance scanner (npx tsx test-ast-chunking.mjs ~/code)
- Validated end-to-end with qmd embed + qmd query on QMD's own codebase
- Zero markdown regressions across all test paths
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
100 lines
2.6 KiB
JSON
100 lines
2.6 KiB
JSON
{
|
|
"name": "@tobilu/qmd",
|
|
"version": "2.0.1",
|
|
"description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
|
|
"type": "module",
|
|
"main": "dist/index.js",
|
|
"types": "dist/index.d.ts",
|
|
"exports": {
|
|
".": {
|
|
"import": "./dist/index.js",
|
|
"types": "./dist/index.d.ts"
|
|
}
|
|
},
|
|
"bin": {
|
|
"qmd": "bin/qmd"
|
|
},
|
|
"files": [
|
|
"bin/",
|
|
"dist/",
|
|
"LICENSE",
|
|
"CHANGELOG.md"
|
|
],
|
|
"scripts": {
|
|
"prepare": "[ -d .git ] && ./scripts/install-hooks.sh || true",
|
|
"build": "tsc -p tsconfig.build.json && printf '#!/usr/bin/env node\n' | cat - dist/cli/qmd.js > dist/cli/qmd.tmp && mv dist/cli/qmd.tmp dist/cli/qmd.js && chmod +x dist/cli/qmd.js",
|
|
"test": "vitest run --reporter=verbose test/",
|
|
"qmd": "tsx src/cli/qmd.ts",
|
|
"index": "tsx src/cli/qmd.ts index",
|
|
"vector": "tsx src/cli/qmd.ts vector",
|
|
"search": "tsx src/cli/qmd.ts search",
|
|
"vsearch": "tsx src/cli/qmd.ts vsearch",
|
|
"rerank": "tsx src/cli/qmd.ts rerank",
|
|
"inspector": "npx @modelcontextprotocol/inspector tsx src/cli/qmd.ts mcp",
|
|
"release": "./scripts/release.sh"
|
|
},
|
|
"publishConfig": {
|
|
"access": "public"
|
|
},
|
|
"repository": {
|
|
"type": "git",
|
|
"url": "git+https://github.com/tobi/qmd.git"
|
|
},
|
|
"homepage": "https://github.com/tobi/qmd#readme",
|
|
"bugs": {
|
|
"url": "https://github.com/tobi/qmd/issues"
|
|
},
|
|
"dependencies": {
|
|
"@modelcontextprotocol/sdk": "^1.25.1",
|
|
"better-sqlite3": "^12.4.5",
|
|
"fast-glob": "^3.3.0",
|
|
"node-llama-cpp": "^3.17.1",
|
|
"picomatch": "^4.0.0",
|
|
"sqlite-vec": "^0.1.7-alpha.2",
|
|
"web-tree-sitter": "0.26.7",
|
|
"yaml": "^2.8.2",
|
|
"zod": "4.2.1"
|
|
},
|
|
"optionalDependencies": {
|
|
"sqlite-vec-darwin-arm64": "^0.1.7-alpha.2",
|
|
"sqlite-vec-darwin-x64": "^0.1.7-alpha.2",
|
|
"sqlite-vec-linux-arm64": "^0.1.7-alpha.2",
|
|
"sqlite-vec-linux-x64": "^0.1.7-alpha.2",
|
|
"sqlite-vec-windows-x64": "^0.1.7-alpha.2",
|
|
"tree-sitter-go": "0.23.4",
|
|
"tree-sitter-python": "0.23.4",
|
|
"tree-sitter-rust": "0.24.0",
|
|
"tree-sitter-typescript": "0.23.2"
|
|
},
|
|
"devDependencies": {
|
|
"@types/better-sqlite3": "^7.6.0",
|
|
"tsx": "^4.0.0",
|
|
"vitest": "^3.0.0"
|
|
},
|
|
"peerDependencies": {
|
|
"typescript": "^5.9.3"
|
|
},
|
|
"engines": {
|
|
"node": ">=22.0.0"
|
|
},
|
|
"keywords": [
|
|
"markdown",
|
|
"search",
|
|
"fts",
|
|
"full-text-search",
|
|
"vector",
|
|
"semantic-search",
|
|
"sqlite",
|
|
"bm25",
|
|
"embeddings",
|
|
"rag",
|
|
"mcp",
|
|
"reranking",
|
|
"knowledge-base",
|
|
"local-ai",
|
|
"llm"
|
|
],
|
|
"author": "Tobi Lutke <tobi@lutke.com>",
|
|
"license": "MIT"
|
|
}
|