Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
135
backend/backfill_similar_questions.py
Normal file
135
backend/backfill_similar_questions.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""Pre-compute similar_questions for all COMP2211 course-library questions.
|
||||
|
||||
For each question, this runs the API's tag/topic similarity scoring (full-text
scoring is skipped, so text_score is always 0) and writes the result into
paper_questions.similar_questions (JSONB). The API will then return this
pre-computed value directly with no computation overhead.
|
||||
|
||||
Run from the backend directory:
|
||||
uv run python backfill_similar_questions.py [--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from collections import Counter
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.routers.questions import (
|
||||
similarity_score,
|
||||
question_family,
|
||||
display_topics,
|
||||
)
|
||||
|
||||
|
||||
# Candidates scoring below this percentage are dropped from the results.
_MIN_MATCH_PERCENT = 20
# Maximum number of similar questions stored per target (at most one per paper).
_MAX_SIMILAR = 12


def _paper_source_label(paper: dict) -> str:
    """Return the display label for a paper, e.g. ``"2023 Fall Midterm Part A"``.

    NULL columns come back as ``None`` with the key present, so a
    ``.get(key, "")`` default never applies. They are rendered as empty text
    here instead of crashing on ``None.title()`` or printing ``"None"``.
    """

    def text(key: str) -> str:
        value = paper.get(key)
        return "" if value is None else str(value)

    label = f"{text('year')} {text('term').title()} {text('exam_type').title()}".strip()
    if paper.get("part_label"):
        label = f"{label} Part {paper['part_label']}"
    return label


def _rank_candidates(target: dict, candidates: list[dict], source_by_paper: dict) -> list[dict]:
    """Score *candidates* against *target* and return the matches, best first."""
    ranked = []
    for candidate in candidates:
        # Full-text scoring is skipped in this backfill, so text_score is fixed at 0.
        match_percent, reasons = similarity_score(target, candidate, text_score=0.0)
        if match_percent < _MIN_MATCH_PERCENT:
            continue
        ranked.append({
            "id": candidate["id"],
            "paper_id": candidate["paper_id"],
            "source": source_by_paper.get(candidate["paper_id"], ""),
            "question_number": candidate["question_number"],
            "match_percent": match_percent,
            "match_reasons": reasons,
            "question_type": question_family(candidate),
            "question_text": candidate["question_text"],
            "topics": display_topics(candidate),
            "difficulty": candidate.get("difficulty"),
            "knowledge_reminder": candidate.get("knowledge_reminder", ""),
            "ai_hint": candidate.get("ai_hint", ""),
            "solution": candidate.get("solution", ""),
        })

    # Highest score first; source label and question number break ties.
    ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"]))
    return ranked


def _best_per_paper(ranked: list[dict], limit: int) -> list[dict]:
    """Keep the first (best-ranked) entry for each paper, capped at *limit*."""
    seen_papers: set[str] = set()
    best = []
    for item in ranked:
        if item["paper_id"] in seen_papers:
            continue
        seen_papers.add(item["paper_id"])
        best.append(item)
    return best[:limit]


def run(dry_run: bool = False) -> None:
    """Pre-compute and store ``similar_questions`` for every COMP2211 question.

    Loads all papers with status ``ready`` for course ``COMP2211`` and their
    questions, scores each question against the questions of the *other*
    papers, and writes the ranked list to ``paper_questions.similar_questions``.
    Progress is printed per question.

    Args:
        dry_run: When true, compute and print the results but write nothing.
    """
    sb = get_supabase()

    # Fetch all ready COMP2211 papers.
    # NOTE(review): neither select below paginates; if the API enforces a row
    # cap per request, large courses could be silently truncated - confirm.
    papers = (
        sb.table("papers")
        .select("id, year, term, exam_type, part_label")
        .eq("course_code", "COMP2211")
        .eq("status", "ready")
        .execute()
        .data
    )
    if not papers:
        print("No ready COMP2211 papers found.")
        return

    # The label depends only on the paper, so build it once per paper rather
    # than once per (target, candidate) pair.
    source_by_paper = {paper["id"]: _paper_source_label(paper) for paper in papers}
    paper_ids = list(source_by_paper)

    # Fetch all questions for these papers.
    all_questions = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
            "difficulty, knowledge_reminder, ai_hint, solution"
        )
        .in_("paper_id", paper_ids)
        .execute()
        .data
    ) or []
    total = len(all_questions)
    print(f"Found {total} questions across {len(papers)} papers.")

    # Batch full-text scores are not practical here; skip the RPC and rely on
    # tag/topic scoring (text_score = 0 for all candidates).

    updated = 0
    skipped = 0

    for i, target in enumerate(all_questions, 1):
        target_paper_id = target["paper_id"]
        target_topic = target.get("analytics_topic")

        # Candidates: same course, different paper; narrowed to the same
        # analytics_topic when the target has one.
        candidates = [
            q for q in all_questions
            if q["paper_id"] != target_paper_id
            and (not target_topic or q.get("analytics_topic") == target_topic)
        ]

        if not candidates:
            # NOTE(review): any previously stored similar_questions value is
            # left untouched here - confirm stale data is acceptable.
            skipped += 1
            print(f"  [{i}/{total}] {target['question_number']} — no candidates, skip")
            continue

        ranked = _rank_candidates(target, candidates, source_by_paper)
        deduped = _best_per_paper(ranked, _MAX_SIMILAR)

        print(f"  [{i}/{total}] {target['question_number']} → {len(deduped)} similar", end="")

        if dry_run:
            print(" [dry-run]")
            continue

        sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", target["id"]).execute()
        updated += 1
        print()

    print(f"\nDone. {updated} updated, {skipped} skipped (no candidates).")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: the presence of --dry-run anywhere on the command
    # line switches the backfill to preview mode (no database writes).
    run(dry_run="--dry-run" in sys.argv)
|
||||
Reference in New Issue
Block a user