37 lines
1.2 KiB
PL/PgSQL
37 lines
1.2 KiB
PL/PgSQL
-- 007: Full-text search on paper_questions.question_text
--
-- Adds a tsvector generated column (auto-maintained by PostgreSQL on every
-- INSERT/UPDATE), a GIN index for fast @@ queries, and a batch-scoring RPC
-- used by the similar-question retrieval endpoint.
|
|
|
|
-- search_text: English-configuration tsvector derived from question_text.
-- Declared GENERATED ALWAYS ... STORED, so PostgreSQL recomputes it on every
-- write and it cannot be assigned directly. A NULL question_text is treated
-- as the empty string, so the column itself is never NULL.
-- IF NOT EXISTS keeps the migration re-runnable.
-- NOTE(review): adding a stored generated column backfills existing rows as
-- part of the ALTER; check lock/rewrite impact on large tables before running.
ALTER TABLE paper_questions
    ADD COLUMN IF NOT EXISTS search_text tsvector
        GENERATED ALWAYS AS (
            to_tsvector('english', COALESCE(question_text, ''))
        ) STORED;
|
|
|
|
-- GIN index over the generated tsvector so `search_text @@ tsquery` lookups
-- can use an index instead of scanning every row.
-- IF NOT EXISTS keeps the migration re-runnable.
CREATE INDEX IF NOT EXISTS idx_pq_search_text
    ON paper_questions
    USING GIN (search_text);
|
|
|
|
-- text_similarity_scores(query_text, candidate_ids)
--
-- Batch-scores paper_questions rows against a free-text query using
-- ts_rank_cd over the precomputed search_text column.
--
-- Parameters:
--   query_text     Free text; parsed with plainto_tsquery('english', ...).
--   candidate_ids  Ids of the paper_questions rows to score.
--
-- Returns:
--   One (question_id, text_score) row per paper_questions row whose id is in
--   candidate_ids. Rows that share no lexemes with the query are still
--   returned, with text_score = 0. A NULL query_text also yields 0 for every
--   row rather than NULL. Candidate ids that match no row in paper_questions
--   are NOT returned, so callers must default those themselves; a NULL
--   candidate_ids array returns no rows.
--
-- Normalization:
--   Flag 1 divides the rank by 1 + log(document length). (Dividing by the
--   number of unique words would be flag 8.) The flag is kept at 1 so that
--   existing scores and any thresholds tuned against them do not shift.
CREATE OR REPLACE FUNCTION text_similarity_scores(
    query_text    text,
    candidate_ids uuid[]
)
RETURNS TABLE (question_id uuid, text_score float4)
LANGUAGE sql STABLE AS $$
    SELECT
        pq.id AS question_id,
        -- ts_rank_cd is NULL when the tsquery is NULL (NULL query_text);
        -- fold that into the documented "no match" score of 0.
        COALESCE(
            ts_rank_cd(
                pq.search_text,
                -- Parameters are qualified with the function name so a
                -- same-named table column can never shadow them.
                plainto_tsquery('english', text_similarity_scores.query_text),
                1  -- normalization: rank / (1 + log(document length))
            ),
            0
        )::float4 AS text_score
    FROM paper_questions AS pq
    WHERE pq.id = ANY (text_similarity_scores.candidate_ids);
$$;
|