67 lines
2.2 KiB
SQL
67 lines
2.2 KiB
SQL
-- ================================================
|
||
-- init.sql - Stable RAG schema with Hybrid Search
|
||
-- For pgvector ≥ 0.5, BGE-M3 (1024 dims), zhparser+english
|
||
-- ================================================
|
||
|
||
-- 1. 避免锁表/阻塞
|
||
SET lock_timeout = '5s';
|
||
SET statement_timeout = '0';
|
||
|
||
-- 2. 必要扩展(向量 + 中文分词)
|
||
CREATE EXTENSION IF NOT EXISTS vector;
|
||
CREATE EXTENSION IF NOT EXISTS zhparser;
|
||
|
||
-- 3. 中文+ 英文混合全文检索配置(zhparser + simple)
|
||
-- 自定义配置名:zhcn_search
|
||
DO $$
|
||
BEGIN
|
||
IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'zhcn_search') THEN
|
||
CREATE TEXT SEARCH CONFIGURATION zhcn_search (PARSER = zhparser);
|
||
ALTER TEXT SEARCH CONFIGURATION zhcn_search
|
||
ADD MAPPING FOR n,v,a,i,e,l WITH simple;
|
||
END IF;
|
||
END$$;
|
||
|
||
-- 4. 创建主表
|
||
CREATE TABLE IF NOT EXISTS public.documents (
|
||
id BIGSERIAL PRIMARY KEY,
|
||
repo TEXT NOT NULL,
|
||
path TEXT NOT NULL,
|
||
chunk_id INT NOT NULL,
|
||
content TEXT NOT NULL,
|
||
embedding VECTOR(1024), -- 向量字段(bge-m3)
|
||
metadata JSONB,
|
||
content_sha TEXT NOT NULL,
|
||
|
||
-- 中文+英文全文搜索字段
|
||
content_tsv tsvector GENERATED ALWAYS AS (
|
||
setweight(to_tsvector('zhcn_search', coalesce(content, '')), 'A')
|
||
) STORED,
|
||
|
||
-- 文档唯一标识(组合键 doc_key)
|
||
doc_key TEXT GENERATED ALWAYS AS (
|
||
repo || ':' || path || ':' || chunk_id
|
||
) STORED,
|
||
|
||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||
);
|
||
|
||
-- 5. 唯一约束(支持 UPSERT)
|
||
CREATE UNIQUE INDEX IF NOT EXISTS documents_doc_key_uk
|
||
ON public.documents (doc_key);
|
||
|
||
-- 6. 向量索引(仅索引非空 embedding)
|
||
CREATE INDEX IF NOT EXISTS documents_embedding_idx
|
||
ON public.documents
|
||
USING hnsw (embedding vector_cosine_ops)
|
||
WHERE embedding IS NOT NULL;
|
||
|
||
-- 7. 全文索引(中文 + 英文,使用 zhcn_search)
|
||
CREATE INDEX IF NOT EXISTS idx_documents_tsv
|
||
ON public.documents USING gin (content_tsv);
|
||
|
||
-- 8. 复合过滤索引(适配 repo + path 检索场景)
|
||
CREATE INDEX IF NOT EXISTS idx_documents_repo_path
|
||
ON public.documents (repo, path);
|