Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions
--- a/backend/split_comp2211_2022_spring_final_part_b.py
+++ b/backend/split_comp2211_2022_spring_final_part_b.py
@@ -0,0 +1,232 @@
+"""Split COMP2211 Spring 2022 final part B into subquestions."""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+
+from app.services.supabase_client import get_supabase
+
+
+# Exam identifier: used to find the paper in the ``papers`` table and to
+# select this exam's rows from the seed file.
+EXAM_KEY = "COMP2211-2022-spring-final-part-b"
+
+# Seed JSON, resolved from the directory that contains this script's own
+# directory: <that dir>/pastpaper-scraper/reviews/COMP2211/problem_seed.json
+PROBLEM_SEED_PATH = Path(__file__).resolve().parent.parent.joinpath(
+    "pastpaper-scraper",
+    "reviews",
+    "COMP2211",
+    "problem_seed.json",
+)
+
+
+@dataclass(frozen=True)
+class ChildSpec:
+    """Immutable description of one subquestion row to write for the paper.
+
+    The leading fields are passed positionally by the ``CHILDREN`` table, so
+    the declaration order below is part of the constructor contract.
+    """
+
+    # Identifier stored in the ``question_number`` column, e.g. "1f_i".
+    question_number: str
+    # Value stored in the ``parent_question`` column, e.g. "1f" for "1f_i".
+    parent_question: str
+    # Key used to look up the parent row loaded from the seed file.
+    top_level_number: str
+    # Marker labels handed to ``extract_segment`` to cut this part's text
+    # out of the parent's text, outermost first, e.g. ("f", "i").
+    path: tuple[str, ...]
+    score: float
+    question_type: str
+    question_format: str | None = None
+    # Tagging values; ``main`` uses them only when the existing database row
+    # has no value, and falls back to the parent seed row when they are unset.
+    analytics_topic: str | None = None
+    topic_primary: str | None = None
+    topic_tags: tuple[str, ...] | None = None
+    skill_tags: tuple[str, ...] | None = None
+    # (label, text) pairs; ``main`` converts them to {"label", "text"} dicts.
+    options: tuple[tuple[str, str], ...] | None = None
+    correct_option: str | None = None
+    correct_answer: str | None = None
+    page_number: int = 1
+
+
+def short_answer(
+    question_number: str,
+    parent_question: str,
+    top_level_number: str,
+    path: tuple[str, ...],
+    score: float,
+    *,
+    analytics_topic: str | None = None,
+    topic_primary: str | None = None,
+    topic_tags: tuple[str, ...] | None = None,
+    skill_tags: tuple[str, ...] | None = None,
+    correct_answer: str | None = None,
+    page_number: int,
+) -> ChildSpec:
+    """Build the ``ChildSpec`` for a short-answer subquestion.
+
+    The spec always has question type ``"long_question"`` and format
+    ``"short_answer"``; every argument is forwarded unchanged.  The tagging
+    arguments and ``correct_answer`` are optional, ``page_number`` is
+    keyword-only and required.
+    """
+    tagging = {
+        "analytics_topic": analytics_topic,
+        "topic_primary": topic_primary,
+        "topic_tags": topic_tags,
+        "skill_tags": skill_tags,
+    }
+    # The positional arguments follow ChildSpec's field order, ending with
+    # question_type and question_format.
+    return ChildSpec(
+        question_number,
+        parent_question,
+        top_level_number,
+        path,
+        score,
+        "long_question",
+        "short_answer",
+        correct_answer=correct_answer,
+        page_number=page_number,
+        **tagging,
+    )
+
+
+def mc(
+    question_number: str,
+    parent_question: str,
+    top_level_number: str,
+    path: tuple[str, ...],
+    score: float,
+    *,
+    options: tuple[tuple[str, str], ...],
+    correct_option: str,
+    analytics_topic: str,
+    skill_tags: tuple[str, ...],
+    page_number: int,
+) -> ChildSpec:
+    """Build the ``ChildSpec`` for a multiple-choice subquestion.
+
+    Question type and format are both ``"mc"``.  ``analytics_topic`` is also
+    used as the primary topic and as the single entry of ``topic_tags``.
+    All keyword-only arguments are required.
+    """
+    topic = analytics_topic
+    tagging = {
+        "analytics_topic": topic,
+        "topic_primary": topic,
+        "topic_tags": (topic,),
+        "skill_tags": skill_tags,
+    }
+    # The positional arguments follow ChildSpec's field order, ending with
+    # question_type and question_format.
+    return ChildSpec(
+        question_number,
+        parent_question,
+        top_level_number,
+        path,
+        score,
+        "mc",
+        "mc",
+        options=options,
+        correct_option=correct_option,
+        page_number=page_number,
+        **tagging,
+    )
+
+
+# Option list shared by the four-choice questions: each option's text is
+# simply its own label, "A" through "D".
+ETHICS_ABCD = tuple((letter, letter) for letter in "ABCD")
+
+
+# Every subquestion to create for this paper.  List order matters: ``main``
+# numbers the rows with ``enumerate(..., start=1)`` and stores that number as
+# ``display_order``.  Entries that declare no topic/skill tags take them from
+# the existing database row or, failing that, from the parent seed row.
+CHILDREN: list[ChildSpec] = [
+    # Question 1
+    ChildSpec("1a", "1", "1", ("a",), 1.5, "long_question", "long_answer", page_number=2),
+    short_answer("1b", "1", "1", ("b",), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("concept_explanation", "data_augmentation"), page_number=2),
+    ChildSpec("1c", "1", "1", ("c",), 4.5, "long_question", "long_answer", page_number=2),
+    short_answer("1d", "1", "1", ("d",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "parameter_reduction"), page_number=3),
+    ChildSpec("1e", "1", "1", ("e",), 2.5, "fill_blank", "fill_blank", correct_answer="1558656", page_number=3),
+    # 1f is split one level deeper, into its (i) and (ii) sub-parts.
+    ChildSpec("1f_i", "1f", "1", ("f", "i"), 2.5, "fill_blank", "fill_blank", correct_answer="2071656", page_number=3),
+    ChildSpec("1f_ii", "1f", "1", ("f", "ii"), 2.5, "fill_blank", "fill_blank", correct_answer="150529000", page_number=4),
+    short_answer("1g", "1", "1", ("g",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "comparison"), page_number=4),
+    # Question 2
+    ChildSpec("2a", "2", "2", ("a",), 9, "long_question", "coding", page_number=5),
+    short_answer("2b", "2", "2", ("b",), 4, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "regression_reasoning"), page_number=6),
+    # Question 3
+    ChildSpec("3a", "3", "3", ("a",), 3.5, "long_question", "long_answer", page_number=9),
+    short_answer("3b", "3", "3", ("b",), 0.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("game_reasoning",), correct_answer="E-a", page_number=9),
+    short_answer("3c", "3", "3", ("c",), 1.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("concept_explanation", "game_reasoning"), page_number=9),
+    short_answer("3d", "3", "3", ("d",), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning_reasoning",), correct_answer="E-j and E-f", page_number=9),
+    # Question 4
+    mc("4a", "4", "4", ("a",), 1, options=ETHICS_ABCD, correct_option="C", analytics_topic="Ethics of AI", skill_tags=("concept_check", "ethical_reasoning"), page_number=10),
+    mc("4b", "4", "4", ("b",), 1, options=ETHICS_ABCD, correct_option="A", analytics_topic="Ethics of AI", skill_tags=("concept_check", "bias_reasoning"), page_number=10),
+    mc("4c", "4", "4", ("c",), 1, options=ETHICS_ABCD, correct_option="C", analytics_topic="Ethics of AI", skill_tags=("concept_check", "ethical_reasoning"), page_number=10),
+    mc("4d", "4", "4", ("d",), 1, options=ETHICS_ABCD, correct_option="B", analytics_topic="Ethics of AI", skill_tags=("concept_check", "bias_reasoning"), page_number=10),
+    short_answer("4e", "4", "4", ("e",), 3, analytics_topic="Ethics of AI", topic_primary="Ethics of AI", topic_tags=("Ethics of AI",), skill_tags=("argumentation", "concept_explanation"), page_number=11),
+]
+
+
+# A part marker is a lowercase label in parentheses at the start of a line,
+# e.g. "(a)" or "(ii)".  ``[a-z]+`` already matches roman numerals, so the
+# former ``|[ivx]+`` alternative could never match anything extra.
+MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
+
+# Roman labels recognised as sub-part numbering, in order (i .. xx).
+_ROMAN_LABELS = (
+    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
+    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
+)
+_ROMAN_VALUE = {label: value for value, label in enumerate(_ROMAN_LABELS, start=1)}
+
+
+def _letter_rank(label: str) -> int:
+    """Return the alphabetical rank of a one-letter label, or 0 for anything else."""
+    return ord(label) if len(label) == 1 else 0
+
+
+def _level_starts(matches: list[re.Match[str]]) -> list[re.Match[str]]:
+    """Pick the markers that open a section at the outermost level.
+
+    The first marker always opens a section.  A later marker is a sibling
+    only when it continues that level's numbering (a later letter, or a
+    larger roman numeral when the level is numbered i, ii, ...); every other
+    marker is treated as nested inside the section opened before it.
+    """
+    if not matches:
+        return []
+    labels = [match.group(1) for match in matches]
+    # A level whose first one-letter label is "i" is taken to be roman-numbered.
+    roman_level = next((lab for lab in labels if len(lab) == 1), "") == "i"
+    starts = [matches[0]]
+    nested_roman = 0  # value of the last roman label nested in the current section
+    lookahead = [*labels[2:], ""]
+    for match, label, following in zip(matches[1:], labels[1:], lookahead):
+        previous = starts[-1].group(1)
+        value = _ROMAN_VALUE.get(label, 0)
+        if roman_level:
+            is_sibling = value > _ROMAN_VALUE.get(previous, 0)
+        elif len(label) != 1 or _letter_rank(label) <= _letter_rank(previous):
+            is_sibling = False
+        elif value == nested_roman + 1:
+            # "i", "v" or "x" is ambiguous: it is either the next letter of
+            # this level or the next item of a nested roman list.  Treat it
+            # as a letter only when it directly follows the previous letter,
+            # no nested roman list is running, and no "(ii)" comes next.
+            is_sibling = (
+                nested_roman == 0
+                and following != "ii"
+                and _letter_rank(label) == _letter_rank(previous) + 1
+            )
+        else:
+            is_sibling = True
+        if is_sibling:
+            starts.append(match)
+            nested_roman = 0
+        elif value:
+            nested_roman = value
+    return starts
+
+
+def _cut_sections(text: str, starts: list[re.Match[str]]) -> tuple[str, dict[str, str]]:
+    """Cut ``text`` at the given marker matches into ``(intro, sections)``."""
+    if not starts:
+        return text.strip(), {}
+    intro = text[: starts[0].start()].strip()
+    ends = [match.start() for match in starts[1:]] + [len(text)]
+    sections = {
+        match.group(1): text[match.start() : end].strip()
+        for match, end in zip(starts, ends)
+    }
+    return intro, sections
+
+
+def split_sections(text: str) -> tuple[str, dict[str, str]]:
+    """Split ``text`` at every part marker, ignoring nesting.
+
+    Returns ``(intro, sections)``: ``intro`` is the stripped text before the
+    first marker and ``sections`` maps each marker label to its stripped
+    text, marker included, up to the next marker of any kind.  When a label
+    occurs more than once the last occurrence wins.  Without any marker the
+    result is ``(text.strip(), {})``.
+    """
+    return _cut_sections(text, list(MARKER_RE.finditer(text)))
+
+
+def extract_segment(text: str, path: tuple[str, ...]) -> str:
+    """Return the text of the sub-part addressed by ``path``.
+
+    ``path`` lists marker labels from the outermost level inwards, e.g.
+    ``("f", "i")`` for part (i) of part (f).  Nested markers stay inside
+    their parent section, so a parent keeps its sub-parts and a sub-part can
+    be reached through its parent.  The intro before the first top-level
+    marker is prepended when the top-level marker is found.  A label that
+    cannot be found leaves the text selected so far unchanged.
+    """
+    current = text.strip()
+    carried_intro: list[str] = []
+    for depth, marker in enumerate(path):
+        body = current
+        if depth > 0:
+            # A section starts with its own marker; drop it so that only the
+            # markers inside the section are considered at this level.
+            lead = MARKER_RE.match(body)
+            if lead is not None and lead.group(1) == path[depth - 1]:
+                body = body[lead.end() :]
+        matches = list(MARKER_RE.finditer(body))
+        intro, sections = _cut_sections(body, _level_starts(matches))
+        if marker not in sections:
+            # Numbering did not follow the expected sequence; fall back to
+            # the flat split so such a label can still be found.
+            sections = _cut_sections(body, matches)[1]
+        if marker not in sections:
+            continue
+        # Only carry the intro once a section was actually selected;
+        # otherwise ``current`` still contains it and it would be duplicated.
+        if depth == 0 and intro:
+            carried_intro.append(intro)
+        current = sections[marker]
+    return "\n".join(part for part in [*carried_intro, current] if part).strip()
+
+
+def load_seed_rows() -> dict[str, dict]:
+    """Load this exam's rows from the seed file, keyed by question number.
+
+    Reads ``PROBLEM_SEED_PATH`` as UTF-8 JSON and keeps the rows whose
+    ``source_exam_key`` equals ``EXAM_KEY``; rows without that key are
+    skipped.  If two kept rows share a ``question_number`` the later one wins.
+
+    Raises whatever reading or parsing the file raises, and ``KeyError`` if a
+    kept row has no ``question_number``.
+    """
+    # Decode explicitly as UTF-8 instead of relying on the locale's default
+    # encoding, which differs between platforms.
+    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
+    return {
+        row["question_number"]: row
+        for row in data
+        if row.get("source_exam_key") == EXAM_KEY
+    }
+
+
+def _build_row(paper_id, display_order: int, child: ChildSpec, parent: dict, existing: dict) -> dict:
+    """Assemble the ``paper_questions`` row for one subquestion.
+
+    Question and answer text are cut out of the parent seed row with
+    ``extract_segment``.  Tagging fields prefer the value on the existing
+    database row, then the value declared on ``child``, then the parent seed
+    row's value.  ``page_y_ratio`` and the authored fields (reminder, hint,
+    solution) are carried over from the existing row.
+    """
+    options = None
+    if child.options:
+        options = [{"label": label, "text": text} for label, text in child.options]
+
+    return {
+        "paper_id": paper_id,
+        "question_number": child.question_number,
+        "parent_question": child.parent_question,
+        "display_order": display_order,
+        "question_type": child.question_type,
+        "question_format": child.question_format,
+        "question_text": extract_segment(parent["question_text"] or "", child.path),
+        "score": child.score,
+        "page_number": child.page_number,
+        "page_y_ratio": existing.get("page_y_ratio"),
+        "options": options,
+        "correct_option": child.correct_option,
+        "correct_answer": child.correct_answer,
+        "raw_answer_text": extract_segment(parent["raw_answer_text"] or "", child.path),
+        "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
+        "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
+        "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
+        "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
+        "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
+        "difficulty": existing.get("difficulty") or parent.get("difficulty"),
+        "knowledge_reminder": existing.get("knowledge_reminder", ""),
+        "ai_hint": existing.get("ai_hint", ""),
+        "solution": existing.get("solution", ""),
+    }
+
+
+def main() -> None:
+    """Replace the paper's question rows with the rows described by ``CHILDREN``.
+
+    Looks up the paper by ``EXAM_KEY``, builds one row per ``ChildSpec`` from
+    the seed file and the rows currently stored, then deletes the paper's
+    existing rows, inserts the new ones and updates the paper's
+    ``question_count`` and ``status``.
+
+    Raises:
+        IndexError: no paper with ``source_exam_key == EXAM_KEY`` exists.
+        KeyError: the seed file lacks a top-level question used by ``CHILDREN``.
+    """
+    sb = get_supabase()
+    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
+    if not papers:
+        raise IndexError(f"No paper found with source_exam_key={EXAM_KEY!r}")
+    paper_id = papers[0]["id"]
+
+    current_rows = (
+        sb.table("paper_questions")
+        .select("*")
+        .eq("paper_id", paper_id)
+        .order("display_order")
+        .execute()
+        .data
+    )
+    existing_by_number = {row["question_number"]: row for row in current_rows}
+
+    parent_rows = load_seed_rows()
+    missing = sorted({child.top_level_number for child in CHILDREN} - parent_rows.keys())
+    if missing:
+        raise KeyError(f"Seed file has no rows for top-level question(s) {missing} of {EXAM_KEY}")
+
+    # Build every row before touching the table, so that a failure while
+    # preparing the data cannot leave the paper without questions.
+    inserts = [
+        _build_row(
+            paper_id,
+            display_order,
+            child,
+            parent_rows[child.top_level_number],
+            existing_by_number.get(child.question_number, {}),
+        )
+        for display_order, child in enumerate(CHILDREN, start=1)
+    ]
+
+    # NOTE(review): delete and insert are two separate requests; if the insert
+    # fails, the paper's previous rows are already gone.
+    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
+    sb.table("paper_questions").insert(inserts).execute()
+    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
+    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")
+
+
+# Run the split only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()