"""Pre-compute similar_questions for all COMP2211 course-library questions. For each question, runs the same similarity logic as the API and writes the result into paper_questions.similar_questions (JSONB). The API will then return this pre-computed value directly with no computation overhead. Run from the backend directory: uv run python backfill_similar_questions.py [--dry-run] """ from __future__ import annotations import sys from collections import Counter from app.services.supabase_client import get_supabase from app.routers.questions import ( similarity_score, question_family, display_topics, ) def run(dry_run: bool = False) -> None: sb = get_supabase() # Fetch all ready COMP2211 papers papers = ( sb.table("papers") .select("id, year, term, exam_type, part_label") .eq("course_code", "COMP2211") .eq("status", "ready") .execute() .data ) if not papers: print("No ready COMP2211 papers found.") return papers_by_id = {p["id"]: p for p in papers} paper_ids = list(papers_by_id.keys()) # Fetch all questions for these papers all_questions = ( sb.table("paper_questions") .select( "id, paper_id, question_number, question_type, question_format, " "question_text, score, topics, analytics_topic, topic_tags, skill_tags, " "difficulty, knowledge_reminder, ai_hint, solution" ) .in_("paper_id", paper_ids) .execute() .data ) print(f"Found {len(all_questions)} questions across {len(papers)} papers.") # Batch full-text scores not practical here; skip RPC, rely on tag/topic scoring # (text_score = 0 for all, still produces good tag-based results) updated = 0 skipped = 0 for i, target in enumerate(all_questions, 1): target_paper_id = target["paper_id"] target_topic = target.get("analytics_topic") # Candidates: same course, different paper candidates = [ q for q in all_questions if q["paper_id"] != target_paper_id ] # Pre-filter by analytics_topic if available if target_topic: candidates = [c for c in candidates if c.get("analytics_topic") == target_topic] if not candidates: skipped += 1 print(f" [{i}/{len(all_questions)}] {target['question_number']} — no candidates, skip") continue ranked = [] for candidate in candidates: match_percent, reasons = similarity_score(target, candidate, text_score=0.0) if match_percent < 20: continue paper = papers_by_id.get(candidate["paper_id"], {}) source = ( f"{paper.get('year', '')} {paper.get('term', '').title()} " f"{paper.get('exam_type', '').title()}" ).strip() if paper.get("part_label"): source = f"{source} Part {paper['part_label']}" ranked.append({ "id": candidate["id"], "paper_id": candidate["paper_id"], "source": source, "question_number": candidate["question_number"], "match_percent": match_percent, "match_reasons": reasons, "question_type": question_family(candidate), "question_text": candidate["question_text"], "topics": display_topics(candidate), "difficulty": candidate.get("difficulty"), "knowledge_reminder": candidate.get("knowledge_reminder", ""), "ai_hint": candidate.get("ai_hint", ""), "solution": candidate.get("solution", ""), }) ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"])) # Deduplicate: best per paper seen_papers: set[str] = set() deduped = [] for item in ranked: if item["paper_id"] not in seen_papers: seen_papers.add(item["paper_id"]) deduped.append(item) deduped = deduped[:12] print(f" [{i}/{len(all_questions)}] {target['question_number']} → {len(deduped)} similar", end="") if dry_run: print(" [dry-run]") continue sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", target["id"]).execute() updated += 1 print() print(f"\nDone. {updated} updated, {skipped} skipped (no candidates).") if __name__ == "__main__": dry_run = "--dry-run" in sys.argv run(dry_run=dry_run)