Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions
--- a/backend/regen_ai_trio_comp2211.py
+++ b/backend/regen_ai_trio_comp2211.py
@@ -0,0 +1,174 @@
+"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions.
+
+Reads existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by
+paper_processor.py — but does UPDATE instead of INSERT, so question structure is untouched.
+
+Run from the backend directory:
+    uv run python regen_ai_trio_comp2211.py
+
+Pass --dry-run to print batches without calling the LLM or writing to the database.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import sys
+from app.services.supabase_client import get_supabase
+from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked
+
+
+def build_reference_answer(q: dict) -> str:
+    """Return q's reference answer: raw text first, then option, then answer, else ""."""
+    raw_text, option = q.get("raw_answer_text"), q.get("correct_option")
+    if raw_text:
+        return raw_text
+    if option:
+        return f"Correct option: {option}"
+    return f"Correct answer: {q['correct_answer']}" if q.get("correct_answer") else ""
+
+
+_BATCH_SIZE = 3  # questions sent to the LLM per request
+_BATCH_PAUSE_S = 2.5  # seconds to wait after each batch
+_RETRY_PAUSE_S = 1  # seconds to wait between one-by-one retries of a failed batch
+
+
+async def _request_analyses(payloads: list[dict]) -> list[dict]:
+    """Send *payloads* through BATCH_ANALYSIS_PROMPT and return the reply's "analyses" list."""
+    result = await qwen_json_completion(
+        system_prompt=BATCH_ANALYSIS_PROMPT.format(
+            questions_payload=json.dumps(payloads, ensure_ascii=False),
+        ),
+        temperature=0.3,
+        max_tokens=8192,
+    )
+    return result.get("analyses", [])
+
+
+def _write_trio(sb, row_id: str, analysis: dict) -> None:
+    """UPDATE the three AI columns of one paper_questions row; other columns are untouched."""
+    sb.table("paper_questions").update({
+        "knowledge_reminder": analysis.get("knowledge_reminder", ""),
+        "ai_hint": analysis.get("ai_hint", ""),
+        "solution": analysis.get("solution", ""),
+    }).eq("id", row_id).execute()
+
+
+async def _regen_single(sb, row_id: str, payload: dict) -> bool:
+    """Regenerate one question on its own; return False (after reporting) when that fails."""
+    try:
+        analyses = await _request_analyses([payload])
+        if not analyses:
+            return False
+        _write_trio(sb, row_id, analyses[0])
+        return True
+    except Exception as exc:  # best-effort: one bad question must not abort the run
+        print(f"\n  single retry failed for {payload['question_number']}: {exc!r}", end="")
+        return False
+
+
+async def regen(dry_run: bool = False) -> None:
+    """Regenerate knowledge_reminder, ai_hint and solution for COMP2211 course-library questions.
+
+    Questions go to the LLM per paper, ``_BATCH_SIZE`` at a time. A question whose analysis is
+    missing or cannot be written is retried alone; if that fails too it is printed as SKIP.
+
+    Args:
+        dry_run: When True, only print the planned batches; no LLM calls, no writes.
+    """
+    sb = get_supabase()
+
+    papers = (
+        sb.table("papers")
+        .select("id")
+        .eq("course_code", "COMP2211")
+        .eq("source_kind", "course_library")
+        .execute()
+        .data
+    )
+    paper_ids = [p["id"] for p in papers]
+    if not paper_ids:
+        print("No COMP2211 course-library papers found.")
+        return
+
+    # NOTE(review): if the API caps rows per response this list may be truncated — confirm.
+    questions = (
+        sb.table("paper_questions")
+        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
+        .in_("paper_id", paper_ids)
+        .order("paper_id")
+        .order("display_order")
+        .execute()
+        .data
+    )
+    print(f"Found {len(questions)} questions across {len(paper_ids)} papers.")
+
+    # Pair each row id with its payload and group by paper so batches don't mix papers.
+    rows_by_paper: dict[str, list[tuple[str, dict]]] = {}
+    for q in questions:
+        payload = {
+            "question_number": q["question_number"],
+            "question_type": q["question_type"] or "long_question",
+            "score": q.get("score") or "unknown",
+            "question_text": q.get("question_text") or "",
+            "topics": q.get("topics") or [],
+            "reference_answer": build_reference_answer(q),
+        }
+        rows_by_paper.setdefault(q["paper_id"], []).append((q["id"], payload))
+
+    total_updated = 0
+    total_papers = len(rows_by_paper)
+
+    for paper_idx, (paper_id, rows) in enumerate(rows_by_paper.items(), 1):
+        print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} — {len(rows)} questions")
+
+        for batch_idx, pairs in enumerate(chunked(rows, _BATCH_SIZE), 1):
+            batch = [payload for _, payload in pairs]
+            print(f"  Batch {batch_idx}: questions {[b['question_number'] for b in batch]}", end="", flush=True)
+
+            if dry_run:
+                print(" [dry-run, skipped]")
+                continue
+
+            # Only the LLM call is guarded here: a later write error must not trigger a
+            # whole-batch retry that re-counts rows which were already updated.
+            try:
+                items = await _request_analyses(batch)
+                # str() keys: the model may echo a question number back as int or as str.
+                analyses = {str(a["question_number"]): a for a in items}
+                batch_failed = False
+            except Exception as exc:
+                print(f" [batch error, retrying 1-by-1: {exc!r}]")
+                analyses, batch_failed = {}, True
+
+            written = 0
+            for row_id, payload in pairs:
+                qnum = payload["question_number"]
+                analysis = analyses.get(str(qnum))
+                ok = False
+                if analysis:
+                    try:
+                        _write_trio(sb, row_id, analysis)
+                        ok = True
+                    except Exception as exc:
+                        print(f"\n  write failed for {qnum}: {exc!r}", end="")
+                if not ok:
+                    # fallback: retry this single question alone
+                    ok = await _regen_single(sb, row_id, payload)
+                    if batch_failed:
+                        await asyncio.sleep(_RETRY_PAUSE_S)
+                if ok:
+                    written += 1
+                else:
+                    print(f"\n  SKIP: {qnum}")
+            total_updated += written
+            print(f" → {written}/{len(batch)} written" if batch_failed else f" → {written} written")
+
+            await asyncio.sleep(_BATCH_PAUSE_S)
+
+    print(f"\nDone. {total_updated} questions updated.")
+
+
+if __name__ == "__main__":
+    # Script entry point: a "--dry-run" argument anywhere on the command line enables dry-run.
+    asyncio.run(regen(dry_run="--dry-run" in sys.argv))