Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions
--- a/backend/backfill_null_ai_trio.py
+++ b/backend/backfill_null_ai_trio.py
@@ -0,0 +1,169 @@
+"""Backfill AI trio for questions where knowledge_reminder IS NULL.
+
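+When run as a script, only questions from COMP2211 course-library papers are considered.
+For each question, the three fields are generated in two separate LLM calls to avoid token truncation: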
+  Call 1 → knowledge_reminder + ai_hint  (short, ~500 tokens output)
+  Call 2 → solution                      (long, up to 4096 tokens output)
+
+Run from the backend directory:
+    uv run python backfill_null_ai_trio.py [--dry-run]
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import sys
+from app.services.supabase_client import get_supabase
+from app.services.paper_processor import qwen_json_completion
+
+
+# Prompt template for the first LLM call (knowledge_reminder + ai_hint).
+# It is rendered with str.format(payload=...), which is why the literal JSON
+# braces are doubled ({{ }}) while {payload} stays a single-brace placeholder.
+KNOWLEDGE_HINT_PROMPT = """\
+You are an expert tutor. Given a past-paper question, produce two short study aids in English.
+
+Return JSON exactly:
+{{
+  "knowledge_reminder": "2-4 sentences summarising the key concept or formula the student must recall.",
+  "ai_hint": "1-3 sentence nudge that guides WITHOUT giving the answer away."
+}}
+
+Question:
+{payload}
+"""
+
+# Prompt template for the second LLM call (the model solution).
+# Rendered with str.format(payload=...): JSON braces are doubled ({{ }}) so only
+# {payload} is substituted. The trailing backslashes join the introduction into
+# one line, and the \\n escape reaches the model as a literal backslash + "n".
+SOLUTION_PROMPT = """\
+You are an expert tutor. Given a past-paper question and its reference answer, write a clear, \
+step-by-step model solution in English. Show all working. Be thorough but stop when the answer \
+is complete — do not pad.
+
+Return JSON exactly:
+{{
+  "solution": "<full step-by-step solution as a single string, use \\n for line breaks>"
+}}
+
+Question:
+{payload}
+"""
+
+
+def build_payload(q: dict) -> dict:
+    """Assemble the question summary that gets serialised into the LLM prompts.
+
+    Args:
+        q: Question row. ``question_number`` and ``question_type`` must be
+            present as keys; every other field is optional.
+
+    Returns:
+        A dict with the keys ``question_number``, ``question_type``, ``score``,
+        ``question_text``, ``topics`` and ``reference_answer`` (in that order).
+        Falsy values fall back to ``"long_question"``, ``"unknown"``, ``""`` and
+        ``[]`` for the type, score, text and topics respectively.
+    """
+    raw_answer = q.get("raw_answer_text")
+    option = q.get("correct_option")
+    answer = q.get("correct_answer")
+
+    # The first truthy source wins: verbatim answer text, then the correct
+    # option, then the correct answer; otherwise the reference stays empty.
+    if raw_answer:
+        reference = raw_answer
+    elif option:
+        reference = f"Correct option: {option}"
+    elif answer:
+        reference = f"Correct answer: {answer}"
+    else:
+        reference = ""
+
+    # Built key by key so the serialised field order matches the prompt layout.
+    payload = {"question_number": q["question_number"]}
+    payload["question_type"] = q["question_type"] or "long_question"
+    payload["score"] = q.get("score") or "unknown"
+    payload["question_text"] = q.get("question_text") or ""
+    payload["topics"] = q.get("topics") or []
+    payload["reference_answer"] = reference
+    return payload
+
+
+async def process_one(sb, q: dict, dry_run: bool) -> bool:
+    """Generate the missing AI fields for one question and persist them.
+
+    Two LLM requests are made one after the other, separated by a one-second
+    pause: the first for ``knowledge_reminder`` and ``ai_hint``, the second for
+    ``solution``. A failing request is reported and does not stop the other one.
+
+    Args:
+        sb: Supabase client used for the final update of ``paper_questions``.
+        q: Question row; needs ``id`` plus the keys read by ``build_payload``.
+        dry_run: When true, only announce the question; no LLM call, no write.
+
+    Returns:
+        True in dry-run mode or when at least one field was written; False when
+        neither request yielded a usable field.
+    """
+    prompt_payload = json.dumps(build_payload(q), ensure_ascii=False)
+    row_id, qnum = q["id"], q["question_number"]
+
+    if dry_run:
+        print(f"    [dry-run] would process {qnum}")
+        return True
+
+    fields: dict = {}
+
+    # First request: the two short study aids. The field extraction stays inside
+    # the try so that a malformed response is reported like any other failure.
+    try:
+        aids = await qwen_json_completion(
+            system_prompt=KNOWLEDGE_HINT_PROMPT.format(payload=prompt_payload),
+            temperature=0.3,
+            max_tokens=1024,
+        )
+        for key in ("knowledge_reminder", "ai_hint"):
+            if aids.get(key):
+                fields[key] = aids[key]
+    except Exception as e:
+        print(f"    WARN call-1 failed for {qnum}: {e}")
+
+    await asyncio.sleep(1)
+
+    # Second request: the long model solution, with a larger token budget.
+    try:
+        worked = await qwen_json_completion(
+            system_prompt=SOLUTION_PROMPT.format(payload=prompt_payload),
+            temperature=0.3,
+            max_tokens=4096,
+        )
+        if worked.get("solution"):
+            fields["solution"] = worked["solution"]
+    except Exception as e:
+        print(f"    WARN call-2 failed for {qnum}: {e}")
+
+    if fields:
+        # NOTE(review): a partial result is written as-is. A row that receives
+        # knowledge_reminder but no solution stops matching the
+        # "knowledge_reminder IS NULL" filter used by backfill(), so a later run
+        # will not retry its solution - confirm this is acceptable.
+        sb.table("paper_questions").update(fields).eq("id", row_id).execute()
+        return True
+
+    print(f"    SKIP {qnum}: both calls returned nothing")
+    return False
+
+
+async def backfill(dry_run: bool = False, course_code: str = "COMP2211") -> None:
+    """Backfill AI fields for a course's questions whose knowledge_reminder is NULL.
+
+    Looks up the course-library papers of ``course_code``, selects their
+    questions with a NULL ``knowledge_reminder`` (ordered by paper, then
+    ``display_order``) and runs ``process_one`` on each, paper by paper,
+    printing progress and a final summary.
+
+    Args:
+        dry_run: Forwarded to ``process_one``. When true, the pause between
+            questions is skipped (no API call is made that would need
+            throttling) and the summary says "would be updated".
+        course_code: Course whose papers are scanned. Defaults to "COMP2211",
+            the value that used to be hard-coded.
+    """
+    from collections import defaultdict
+
+    sb = get_supabase()
+
+    papers = (
+        sb.table("papers")
+        .select("id")
+        .eq("course_code", course_code)
+        .eq("source_kind", "course_library")
+        .execute()
+        .data
+    )
+    paper_ids = [p["id"] for p in papers]
+    if not paper_ids:
+        print(f"No {course_code} course-library papers found.")
+        return
+
+    # NOTE(review): this is a single un-paginated select. If the API caps the
+    # number of rows per response, the remaining questions are only picked up
+    # by running the script again - confirm against the Supabase settings.
+    questions = (
+        sb.table("paper_questions")
+        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
+        .in_("paper_id", paper_ids)
+        .is_("knowledge_reminder", "null")
+        .order("paper_id")
+        .order("display_order")
+        .execute()
+        .data
+    )
+
+    if not questions:
+        print("No NULL questions found — all done!")
+        return
+
+    print(f"Found {len(questions)} questions with NULL knowledge_reminder.")
+
+    # Group by paper for cleaner output
+    by_paper: dict[str, list] = defaultdict(list)
+    for q in questions:
+        by_paper[q["paper_id"]].append(q)
+
+    total_updated = 0
+    for paper_idx, (paper_id, qs) in enumerate(by_paper.items(), 1):
+        print(f"\n[{paper_idx}/{len(by_paper)}] paper_id={paper_id} — {len(qs)} NULL questions")
+        for q in qs:
+            print(f"  Processing {q['question_number']}...", end=" ", flush=True)
+            if await process_one(sb, q, dry_run):
+                total_updated += 1
+                print("done")
+            # The pause only exists to space out LLM requests; a dry run makes
+            # none, so it should not take 1.5 s per question.
+            if not dry_run:
+                await asyncio.sleep(1.5)
+
+    # A dry run writes nothing, so the summary must not claim rows were updated.
+    outcome = "would be updated" if dry_run else "updated"
+    print(f"\nDone. {total_updated}/{len(questions)} questions {outcome}.")
+
+
+if __name__ == "__main__":
+    # Script entry point: passing --dry-run anywhere on the command line makes
+    # the run announce each question instead of calling the LLM or writing rows.
+    asyncio.run(backfill(dry_run="--dry-run" in sys.argv))