accounts/docs/init.sql

162 lines
4.9 KiB
PL/PgSQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- ================================================
-- init.sql (idempotent / repeatable)
-- Vector RAG schema for BGE-M3 (1024 dims)
-- ================================================
-- 可选,但建议:避免长时间阻塞
SET lock_timeout = '5s';
SET statement_timeout = '0';
-- 1) 扩展
CREATE EXTENSION IF NOT EXISTS vector;
-- 2) 表(如果不存在则创建最小骨架)
CREATE TABLE IF NOT EXISTS public.documents (
id BIGSERIAL PRIMARY KEY,
repo TEXT NOT NULL,
path TEXT NOT NULL,
chunk_id INT NOT NULL,
content TEXT NOT NULL,
embedding VECTOR(1024),
metadata JSONB,
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- 2.1) 幂等补列content_sha
ALTER TABLE public.documents
ADD COLUMN IF NOT EXISTS content_sha TEXT;
-- 2.2) 幂等补列content_tsv
ALTER TABLE public.documents
ADD COLUMN IF NOT EXISTS content_tsv tsvector;
-- 2.3) 幂等补列doc_key优先创建为生成列表达式若已存在普通列也可共存
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1
FROM information_schema.columns
WHERE table_schema='public' AND table_name='documents' AND column_name='doc_key'
) THEN
EXECUTE $SQL$
ALTER TABLE public.documents
ADD COLUMN doc_key TEXT GENERATED ALWAYS AS (repo || ':' || path || ':' || chunk_id) STORED
$SQL$;
END IF;
END$$;
-- 2.4) 兜底触发器:若 doc_key 不是生成列或存在旧脏数据,则在写入时自动修正
CREATE OR REPLACE FUNCTION public.documents_doc_key_fill()
RETURNS trigger AS $$
BEGIN
IF NEW.doc_key IS NULL OR NEW.doc_key <> (NEW.repo || ':' || NEW.path || ':' || NEW.chunk_id) THEN
NEW.doc_key := (NEW.repo || ':' || NEW.path || ':' || NEW.chunk_id);
END IF;
RETURN NEW;
END
$$ LANGUAGE plpgsql;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname='trg_documents_doc_key_fill') THEN
CREATE TRIGGER trg_documents_doc_key_fill
BEFORE INSERT OR UPDATE OF repo, path, chunk_id
ON public.documents
FOR EACH ROW
EXECUTE FUNCTION public.documents_doc_key_fill();
END IF;
END$$;
-- 3) 唯一约束 / 唯一索引(满足 ON CONFLICT (doc_key)
-- 用唯一索引实现,幂等
CREATE UNIQUE INDEX IF NOT EXISTS documents_doc_key_uk
ON public.documents (doc_key);
-- 4) HNSW 索引cosine 距离 + 仅索引非空向量)
-- 某些 pgvector 旧版不支持 WITH(m, ef_construction),做兼容回退
DO $$
BEGIN
-- 先尝试带参数版本
BEGIN
EXECUTE $SQL$
CREATE INDEX IF NOT EXISTS documents_embedding_idx
ON public.documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64)
WHERE embedding IS NOT NULL
$SQL$;
EXCEPTION WHEN others THEN
-- 参数不支持则退化为无参数版本
PERFORM 1;
EXECUTE 'DROP INDEX IF EXISTS documents_embedding_idx';
EXECUTE $SQL$
CREATE INDEX IF NOT EXISTS documents_embedding_idx
ON public.documents
USING hnsw (embedding vector_cosine_ops)
WHERE embedding IS NOT NULL
$SQL$;
END;
END$$;
-- 5) 元数据 & 常用过滤索引
CREATE INDEX IF NOT EXISTS idx_documents_metadata
ON public.documents USING gin (metadata);
CREATE INDEX IF NOT EXISTS idx_documents_repo
ON public.documents (repo);
CREATE INDEX IF NOT EXISTS idx_documents_path
ON public.documents (path);
CREATE INDEX IF NOT EXISTS idx_documents_repo_path
ON public.documents (repo, path);
-- 6) 全文检索(英文示例;中文可换词典/外部分词)
CREATE OR REPLACE FUNCTION public.documents_tsv_trigger()
RETURNS trigger AS $$
BEGIN
NEW.content_tsv := setweight(to_tsvector('english', coalesce(NEW.content,'')), 'A');
RETURN NEW;
END
$$ LANGUAGE plpgsql;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname='documents_tsv_update') THEN
CREATE TRIGGER documents_tsv_update
BEFORE INSERT OR UPDATE OF content
ON public.documents
FOR EACH ROW
EXECUTE FUNCTION public.documents_tsv_trigger();
END IF;
END$$;
CREATE INDEX IF NOT EXISTS idx_documents_tsv
ON public.documents USING gin (content_tsv);
-- 7) updated_at 自动维护
CREATE OR REPLACE FUNCTION public.set_updated_at()
RETURNS trigger AS $$
BEGIN
NEW.updated_at := now();
RETURN NEW;
END
$$ LANGUAGE plpgsql;
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname='trg_documents_set_updated_at') THEN
CREATE TRIGGER trg_documents_set_updated_at
BEFORE UPDATE ON public.documents
FOR EACH ROW
EXECUTE FUNCTION public.set_updated_at();
END IF;
END$$;
-- 8) 可选:一次性回填已有行的 content_tsv / doc_key首次迁移时执行之后可注释掉
-- UPDATE public.documents
-- SET content_tsv = setweight(to_tsvector('english', coalesce(content,'')), 'A'),
-- doc_key = (repo || ':' || path || ':' || chunk_id)
-- WHERE content_tsv IS NULL OR doc_key IS NULL;