Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions
--- a/backend/backfill_similar_questions.py
+++ b/backend/backfill_similar_questions.py
@@ -0,0 +1,135 @@
+"""Pre-compute similar_questions for all COMP2211 course-library questions.
+
+For each question, runs the API's tag/topic similarity scoring (the full-text
+score is fixed at 0 here, so results are tag/topic-based only) and writes the
+result into paper_questions.similar_questions (JSONB). The API will then return
+this pre-computed value directly with no computation overhead.
+
+Run from the backend directory:
+    uv run python backfill_similar_questions.py [--dry-run]
+"""
+
+from __future__ import annotations
+
+import sys
+from collections import Counter
+from app.services.supabase_client import get_supabase
+from app.routers.questions import (
+    similarity_score,
+    question_family,
+    display_topics,
+)
+
+
+def _fetch_all_rows(make_query, page_size: int = 1000) -> list[dict]:
+    """Return every row matched by a Supabase select, fetched page by page.
+
+    An un-ranged select can be cut off by the API's max-rows setting (Supabase
+    projects default to 1,000 rows), which would silently truncate the result.
+    Paging with an explicit ``id`` ordering avoids that and keeps the row order
+    deterministic between runs.
+
+    Args:
+        make_query: Zero-argument callable returning a fresh, already filtered
+            select builder; it is called once per page.
+        page_size: Number of rows requested per page.
+
+    Returns:
+        All matching rows, ordered by ``id``.
+    """
+    rows: list[dict] = []
+    while True:
+        start = len(rows)
+        page = (
+            make_query()
+            .order("id")
+            .range(start, start + page_size - 1)
+            .execute()
+            .data
+        ) or []
+        # Stop on an empty page rather than a short one: the server may cap
+        # pages below ``page_size``, so a short page is not proof of the end.
+        if not page:
+            return rows
+        rows.extend(page)
+
+
+def _source_label(paper: dict) -> str:
+    """Build the human-readable paper label, e.g. ``2023 Fall Midterm Part A``.
+
+    NULL columns are treated as empty strings; ``dict.get(key, "")`` alone does
+    not help because the key is present with value ``None``.
+    """
+    year = paper.get("year")
+    term = str(paper.get("term") or "").title()
+    exam_type = str(paper.get("exam_type") or "").title()
+    label = f"{'' if year is None else year} {term} {exam_type}".strip()
+    if paper.get("part_label"):
+        label = f"{label} Part {paper['part_label']}"
+    return label
+
+
+def _rank_similar(
+    target: dict,
+    all_questions: list[dict],
+    source_by_paper: dict,
+    min_match_percent: float,
+    max_results: int,
+) -> list[dict] | None:
+    """Rank questions from other papers by similarity to ``target``.
+
+    Candidates are questions from a different paper; when the target has an
+    ``analytics_topic`` only candidates with the same topic are considered.
+
+    Returns:
+        ``None`` when there are no candidates at all, otherwise the best match
+        per paper (at most ``max_results`` entries, possibly empty), sorted by
+        descending match percent, then source label, then question number.
+    """
+    target_paper_id = target["paper_id"]
+    target_topic = target.get("analytics_topic")
+
+    candidates = [
+        q for q in all_questions
+        if q["paper_id"] != target_paper_id
+        and (not target_topic or q.get("analytics_topic") == target_topic)
+    ]
+    if not candidates:
+        return None
+
+    ranked = []
+    for candidate in candidates:
+        # Full-text score is not batched here, so scoring relies on tags/topics.
+        match_percent, reasons = similarity_score(target, candidate, text_score=0.0)
+        if match_percent < min_match_percent:
+            continue
+        ranked.append({
+            "id": candidate["id"],
+            "paper_id": candidate["paper_id"],
+            "source": source_by_paper.get(candidate["paper_id"], ""),
+            "question_number": candidate["question_number"],
+            "match_percent": match_percent,
+            "match_reasons": reasons,
+            "question_type": question_family(candidate),
+            "question_text": candidate["question_text"],
+            "topics": display_topics(candidate),
+            "difficulty": candidate.get("difficulty"),
+            "knowledge_reminder": candidate.get("knowledge_reminder", ""),
+            "ai_hint": candidate.get("ai_hint", ""),
+            "solution": candidate.get("solution", ""),
+        })
+
+    ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"]))
+
+    # Deduplicate: keep only the best-ranked question of each paper.
+    seen_papers: set[str] = set()
+    best_per_paper = []
+    for item in ranked:
+        if item["paper_id"] in seen_papers:
+            continue
+        seen_papers.add(item["paper_id"])
+        best_per_paper.append(item)
+    return best_per_paper[:max_results]
+
+
+def run(
+    dry_run: bool = False,
+    *,
+    course_code: str = "COMP2211",
+    min_match_percent: float = 20,
+    max_results: int = 12,
+) -> None:
+    """Pre-compute and store ``similar_questions`` for every question of a course.
+
+    Loads all papers of ``course_code`` with status ``ready`` and their
+    questions, ranks the questions of the other papers against each question,
+    and writes the ranking to ``paper_questions.similar_questions``. Questions
+    with no candidates are skipped and their stored value is left untouched.
+
+    Args:
+        dry_run: When true, print what would be written without updating rows.
+        course_code: Course whose ready papers are processed.
+        min_match_percent: Candidates scoring below this are dropped.
+        max_results: Maximum number of similar questions stored per question.
+    """
+    sb = get_supabase()
+
+    # Fetch all ready papers of the course
+    papers = _fetch_all_rows(
+        lambda: sb.table("papers")
+        .select("id, year, term, exam_type, part_label")
+        .eq("course_code", course_code)
+        .eq("status", "ready")
+    )
+    if not papers:
+        print(f"No ready {course_code} papers found.")
+        return
+
+    # Labels depend only on the paper, so build each one once up front.
+    source_by_paper = {p["id"]: _source_label(p) for p in papers}
+    paper_ids = list(source_by_paper)
+
+    # Fetch all questions for these papers
+    all_questions = _fetch_all_rows(
+        lambda: sb.table("paper_questions")
+        .select(
+            "id, paper_id, question_number, question_type, question_format, "
+            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
+            "difficulty, knowledge_reminder, ai_hint, solution"
+        )
+        .in_("paper_id", paper_ids)
+    )
+    total = len(all_questions)
+    print(f"Found {total} questions across {len(papers)} papers.")
+
+    updated = 0
+    skipped = 0
+
+    for i, target in enumerate(all_questions, 1):
+        similar = _rank_similar(
+            target, all_questions, source_by_paper, min_match_percent, max_results
+        )
+        if similar is None:
+            skipped += 1
+            print(f"  [{i}/{total}] {target['question_number']} — no candidates, skip")
+            continue
+
+        # Printed before the write so a failing update still shows its question.
+        print(f"  [{i}/{total}] {target['question_number']} → {len(similar)} similar", end="")
+
+        if dry_run:
+            print(" [dry-run]")
+            continue
+
+        sb.table("paper_questions").update({"similar_questions": similar}).eq("id", target["id"]).execute()
+        updated += 1
+        print()
+
+    print(f"\nDone. {updated} updated, {skipped} skipped (no candidates).")
+
+
+if __name__ == "__main__":
+    # Script entry point: the presence of --dry-run anywhere on the command
+    # line disables the database writes.
+    run(dry_run="--dry-run" in sys.argv)