Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions
--- a/backend/backfill_vision.py
+++ b/backend/backfill_vision.py
@@ -0,0 +1,238 @@
+"""
+Re-process every paper whose status is "ready" using Vision mode:
+- pull the PDF from Supabase Storage → images → Vision question extraction → exec → AI trio → update DB
+
+Usage:
+  python backfill_vision.py --course COMP2211
+  python backfill_vision.py --paper-id <uuid>
+"""
+
+import asyncio
+import argparse
+import requests
+from app.services.supabase_client import get_supabase
+from app.services.paper_processor import (
+    process_paper,
+    strip_nulls,
+    pdf_to_images,
+    gemini_vision_json,
+    deepseek_json_completion,
+    parse_json_response,
+    extract_code_lines,
+    try_exec_python,
+    chunked,
+    sort_questions,
+    STRUCTURE_PROMPT,
+    ANSWER_MATCH_PROMPT,
+    BATCH_ANALYSIS_PROMPT,
+)
+import json
+import traceback
+
+
+async def reprocess_paper(paper: dict):
+    """重新处理单张试卷（Vision 模式）"""
+    sb = get_supabase()
+    paper_id = paper["id"]
+    label = f"{paper['course_code']} {paper['year']} {paper['term']} {paper['exam_type']}"
+    print(f"\n=== {label} ({paper_id[:8]}) ===")
+
+    # 1. 拉 PDF
+    try:
+        pdf_bytes = requests.get(paper["paper_file_url"], timeout=60).content
+    except Exception as e:
+        print(f"  SKIP: failed to fetch PDF: {e}")
+        return
+
+    answer_bytes = None
+    if paper.get("answer_file_url"):
+        try:
+            answer_bytes = requests.get(paper["answer_file_url"], timeout=60).content
+        except Exception:
+            pass
+
+    # 2. PDF → 图片
+    print(f"  Rendering {len(pdf_to_images(pdf_bytes))} pages...", end=" ", flush=True)
+    paper_images = pdf_to_images(pdf_bytes)
+    print("done")
+
+    # 3. Vision 拆题（分批，每批 8 页）
+    PAGE_BATCH = 8
+    all_questions: list = []
+    meta: dict = {}
+    print(f"  Vision extraction ({len(paper_images)} pages, {-(-len(paper_images)//PAGE_BATCH)} batches)...")
+    for i in range(0, len(paper_images), PAGE_BATCH):
+        batch_imgs = paper_images[i:i + PAGE_BATCH]
+        print(f"    Pages {i+1}-{i+len(batch_imgs)}...", end=" ", flush=True)
+        try:
+            batch_result = await gemini_vision_json(
+                system_prompt=STRUCTURE_PROMPT,
+                images=batch_imgs,
+                user_text=f"Pages {i+1}-{i+len(batch_imgs)} of the exam paper. Extract all questions visible on these pages.",
+                temperature=0,
+            )
+            if not meta:
+                meta = {k: batch_result.get(k) for k in ("total_score", "difficulty_level", "topics_summary")}
+            qs = batch_result.get("questions", [])
+            all_questions.extend(qs)
+            print(f"done ({len(qs)} questions)")
+        except Exception as e:
+            print(f"FAILED: {e}")
+    structure = {**meta, "questions": all_questions}
+    questions = sort_questions(all_questions)
+    print(f"  Total: {len(questions)} questions extracted")
+
+    # 4. 答案匹配
+    answers_map = {}
+    if answer_bytes:
+        print("  Vision answer matching...", end=" ", flush=True)
+        answer_images = pdf_to_images(answer_bytes)
+        questions_json = json.dumps(
+            [{"question_number": q["question_number"], "question_type": q["question_type"]}
+             for q in questions], ensure_ascii=False
+        )
+        try:
+            match_result = await gemini_vision_json(
+                system_prompt=ANSWER_MATCH_PROMPT.format(
+                    questions_json=questions_json, answer_text="(See images)"
+                ),
+                images=answer_images,
+                user_text=f"Match answers to these questions: {questions_json}",
+                temperature=0,
+            )
+            answers_map = {a["question_number"]: a for a in match_result.get("answers", [])}
+            print(f"done ({len(answers_map)} matched)")
+        except Exception as e:
+            print(f"FAILED: {e}")
+
+    # 5. 构建 payloads（exec Python）
+    import numpy as np
+    exec_namespaces: dict = {}
+    batched_payloads = []
+
+    for q in questions:
+        qnum = q["question_number"]
+        answer = answers_map.get(qnum, {})
+        full_text = q["question_text"] or ""
+
+        answer_section = ""
+        if answer.get("raw_answer_text"):
+            answer_section = answer["raw_answer_text"]
+        elif answer.get("correct_option"):
+            answer_section = f"Correct option: {answer['correct_option']}"
+        elif answer.get("correct_answer"):
+            answer_section = f"Correct answer: {answer['correct_answer']}"
+
+        if not answer_section:
+            parent_q = q.get("parent_question")
+            group_key = parent_q or qnum
+            if group_key not in exec_namespaces:
+                ns: dict = {"np": np}
+                setup = extract_code_lines(full_text)
+                try_exec_python(setup, ns)
+                exec_namespaces[group_key] = ns
+            ns = exec_namespaces[group_key]
+            print_lines = [l.strip() for l in full_text.splitlines() if l.strip().startswith("print(")]
+            if print_lines:
+                out = try_exec_python(print_lines[-1], ns)
+                if out is not None:
+                    answer_section = f"Executed output: {out}"
+                    print(f"    [exec] {qnum}: {out[:60]}")
+
+        batched_payloads.append({
+            "question_number": qnum,
+            "question_type": q["question_type"],
+            "score": q.get("score", "unknown"),
+            "question_text": full_text,
+            "topics": q.get("topics", []),
+            "reference_answer": answer_section,
+        })
+
+    # 6. AI trio
+    print(f"  Generating AI trio ({len(batched_payloads)} questions, {len(list(chunked(batched_payloads, 3)))} batches)...")
+    analyses: dict = {}
+    for batch in chunked(batched_payloads, 3):
+        nums = [p["question_number"] for p in batch]
+        print(f"    Batch {nums}...", end=" ", flush=True)
+        try:
+            result = await deepseek_json_completion(
+                system_prompt=BATCH_ANALYSIS_PROMPT.format(
+                    questions_payload=json.dumps(batch, ensure_ascii=False)
+                ),
+                temperature=0.3,
+            )
+            for item in result.get("analyses", []):
+                if item.get("question_number"):
+                    analyses[item["question_number"]] = item
+            print(f"done ({len(result.get('analyses', []))})")
+        except Exception as e:
+            print(f"FAILED: {e}")
+        await asyncio.sleep(1)
+
+    # 7. 删除旧题目，写入新题目
+    print("  Writing to DB...", end=" ", flush=True)
+    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
+
+    for i, q in enumerate(questions):
+        qnum = q["question_number"]
+        answer = answers_map.get(qnum, {})
+        analysis = analyses.get(qnum, {})
+        sb.table("paper_questions").insert(strip_nulls({
+            "paper_id": paper_id,
+            "question_number": qnum,
+            "parent_question": q.get("parent_question"),
+            "display_order": i,
+            "question_type": q["question_type"],
+            "question_text": q["question_text"],
+            "score": q.get("score"),
+            "page_number": q.get("page_number"),
+            "options": q.get("options"),
+            "correct_option": answer.get("correct_option"),
+            "correct_answer": answer.get("correct_answer"),
+            "raw_answer_text": answer.get("raw_answer_text"),
+            "topics": q.get("topics", []),
+            "analytics_topic": q.get("topics", [None])[0],
+            "topic_tags": q.get("topics", []),
+            "difficulty": q.get("difficulty"),
+            "knowledge_reminder": analysis.get("knowledge_reminder", ""),
+            "ai_hint": analysis.get("ai_hint", ""),
+            "solution": analysis.get("solution", ""),
+        })).execute()
+
+    sb.table("papers").update({
+        "question_count": len(questions),
+        "total_score": structure.get("total_score"),
+        "topics_summary": structure.get("topics_summary"),
+        "difficulty_level": structure.get("difficulty_level"),
+    }).eq("id", paper_id).execute()
+
+    print(f"done ({len(questions)} questions written)")
+
+
+async def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--course", help="Course code")
+    parser.add_argument("--paper-id", help="Single paper ID")
+    args = parser.parse_args()
+
+    sb = get_supabase()
+    query = sb.table("papers").select("*").eq("status", "ready")
+    if args.paper_id:
+        query = query.eq("id", args.paper_id)
+    elif args.course:
+        query = query.eq("course_code", args.course.upper())
+    papers = query.order("created_at").execute().data
+
+    print(f"Papers to reprocess: {len(papers)}")
+    for paper in papers:
+        try:
+            await reprocess_paper(paper)
+        except Exception as e:
+            print(f"  ERROR: {e}")
+            traceback.print_exc()
+
+    print("\nAll done.")
+
+
+# Script entry point: run the async main() to completion when executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())