# PastpaperMaster/backend/split_comp2211_2022_spring_final_part_a.py

"""Split COMP2211 Spring 2022 final part A into subquestions."""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from pathlib import Path

from app.services.supabase_client import get_supabase


# Key used to find this exam both in the ``papers`` table and in the seed file.
EXAM_KEY = "COMP2211-2022-spring-final-part-a"

# Option list attached to every true/false subquestion.
TRUE_FALSE_OPTIONS = [{"label": choice, "text": choice} for choice in ("True", "False")]

# Seed JSON, located relative to the directory two levels above this file.
PROBLEM_SEED_PATH = Path(__file__).resolve().parent.parent.joinpath(
    "pastpaper-scraper",
    "reviews",
    "COMP2211",
    "problem_seed.json",
)


@dataclass(frozen=True)
class ChildSpec:
    """Immutable description of one subquestion row to create for the exam.

    ``main`` turns each spec into one ``paper_questions`` row. The topic and
    skill fields are defaults: values already stored on an existing row take
    priority, and the parent seed row is used when a field here is ``None``.
    """

    # Identifier written to the row's ``question_number`` (e.g. "3e_i").
    question_number: str
    # Value written to the row's ``parent_question`` (e.g. "3e").
    parent_question: str
    # Question number of the seed row whose text this subquestion is cut from.
    top_level_number: str
    # Markers passed to ``extract_segment`` to locate the subquestion's text,
    # outermost first (e.g. ("e", "i")).
    path: tuple[str, ...]
    # Marks written to the row's ``score``.
    score: float
    # "true_false" and "fill_blank" get extra answer handling in ``main``.
    question_type: str
    question_format: str | None = None
    analytics_topic: str | None = None
    topic_primary: str | None = None
    # Also used by ``main`` as the default for the row's ``topics`` field.
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # Page number written to the row's ``page_number``.
    page_number: int = 1


def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    page_number: int,
) -> ChildSpec:
    """Build the ``ChildSpec`` for a short-answer subquestion.

    All arguments are forwarded unchanged; the only values fixed here are
    ``question_type="long_question"`` and ``question_format="short_answer"``.
    Unlike ``ChildSpec`` itself, ``page_number`` has no default and must be
    given by keyword.
    """
    # Leading ChildSpec fields, in declaration order.
    identity = (question_number, parent_question, top_level_number, path, score)
    metadata = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
        "page_number": page_number,
    }
    # The two literals fill ``question_type`` and ``question_format``, which
    # directly follow ``score`` in the dataclass.
    return ChildSpec(*identity, "long_question", "short_answer", **metadata)


# Every subquestion row to create, in display order: ``main`` numbers the rows
# by their position in this list, starting at 1.
CHILDREN: list[ChildSpec] = [
    # Question 1: ten true/false items (a)-(j), 1 mark each.
    ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
    ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "architecture_reasoning"), page_number=2),
    ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "activation_selection"), page_number=2),
    ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "metric_reasoning"), page_number=2),
    ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "hardware_reasoning"), page_number=2),
    ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "image_processing"), page_number=2),
    ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "cnn_architecture"), page_number=2),
    ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "regularization"), page_number=2),
    ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "game_reasoning"), page_number=2),
    ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "pruning_reasoning"), page_number=2),
    # Question 2: two long-answer parts.
    ChildSpec("2a", "2", "2", ("a",), 6.5, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "probability_reasoning", "classification_decision"), page_number=4),
    ChildSpec("2b", "2", "2", ("b",), 7.5, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "algorithm_tracing", "classification_decision"), page_number=4),
    # Question 3: short-answer parts; part (e) is split into (i) and (ii).
    short_answer("3a", "3", "3", ("a",), 3, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("concept_explanation", "metric_reasoning"), page_number=6),
    short_answer("3b", "3", "3", ("b",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation", "activation_selection"), page_number=6),
    short_answer("3c", "3", "3", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("architecture_reasoning", "output_layer_design"), page_number=6),
    short_answer("3d", "3", "3", ("d",), 3, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation", "optimization_reasoning"), page_number=6),
    short_answer("3e_i", "3e", "3", ("e", "i"), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("optimization_reasoning",), page_number=6),
    short_answer("3e_ii", "3e", "3", ("e", "ii"), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("optimization_reasoning",), page_number=6),
    short_answer("3f", "3", "3", ("f",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("regularization", "concept_explanation"), page_number=6),
    # Question 4: no topic/skill metadata is given here, so ``main`` falls back
    # to the existing row's values or the parent seed row's values.
    ChildSpec("4a_i", "4a", "4", ("a", "i"), 2, "fill_blank", "fill_blank", page_number=7),
    ChildSpec("4a_ii", "4a", "4", ("a", "ii"), 2, "long_question", "long_answer", page_number=7),
    ChildSpec("4b_i", "4b", "4", ("b", "i"), 3, "fill_blank", "fill_blank", page_number=7),
    ChildSpec("4b_ii", "4b", "4", ("b", "ii"), 4, "fill_blank", "fill_blank", page_number=7),
    ChildSpec("4b_iii", "4b", "4", ("b", "iii"), 4, "long_question", "long_answer", page_number=7),
]


# Matches a part marker such as "(a)" or "(ii)" at the start of a line, plus
# any whitespace after it. The ``[ivx]+`` alternative is already covered by
# ``[a-z]+``; letter and roman-numeral markers are told apart in
# ``_marker_kind`` instead.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")

_ROMAN_CHARS = frozenset("ivx")


def _marker_kind(marker: str, previous_letter: str) -> str:
    """Classify *marker* as a ``"letter"`` or ``"roman"`` numbering marker.

    A marker made only of ``i``/``v``/``x`` is treated as a roman numeral,
    except when it is the single letter that directly follows
    *previous_letter* (e.g. ``i`` after ``h``), in which case it continues the
    letter sequence. Pass ``""`` when there is no previous letter marker.
    """
    if not set(marker) <= _ROMAN_CHARS:
        return "letter"
    continues_letters = (
        len(marker) == 1
        and len(previous_letter) == 1
        and ord(marker) == ord(previous_letter) + 1
    )
    return "letter" if continues_letters else "roman"


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* into its lead-in and its top-level marked sections.

    The first marker found decides the numbering kind of the top level
    (letters or roman numerals). Markers of the other kind are treated as
    nested parts and stay inside the enclosing section instead of ending it,
    so a section such as ``(e)`` keeps its ``(i)``/``(ii)`` sub-parts.

    Known ambiguity: a roman ``(i)`` sub-part placed directly under ``(h)``
    is read as the letter ``i``.

    Returns:
        ``(intro, sections)`` where ``intro`` is the stripped text before the
        first marker and ``sections`` maps each top-level marker (without
        parentheses) to its stripped text, marker included. When no marker is
        present the result is ``(text.strip(), {})``. If a marker repeats at
        the top level, the last occurrence wins.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    intro = text[: matches[0].start()].strip()

    # Keep only the markers that belong to the top level of this text.
    top_kind = ""
    last_top = ""
    starts = []
    for match in matches:
        marker = match.group(1)
        kind = _marker_kind(marker, last_top if top_kind == "letter" else "")
        if not top_kind:
            top_kind = kind
        if kind != top_kind:
            continue  # nested marker: part of the current section
        starts.append(match)
        last_top = marker

    sections: dict[str, str] = {}
    for idx, match in enumerate(starts):
        end = starts[idx + 1].start() if idx + 1 < len(starts) else len(text)
        sections[match.group(1)] = text[match.start() : end].strip()
    return intro, sections


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, with its lead-in context.

    *path* lists markers from the outermost level inwards, e.g. ``("e", "i")``
    for part (i) of part (e). At each level the text is split into top-level
    sections and the requested one is entered. The non-empty lead-in of every
    level entered (the question intro, then each enclosing part's stem with
    its marker) is prepended to the final section, joined by newlines.

    If a marker cannot be found, descent stops and the text reached so far is
    returned as is; its lead-in is not prepended again, so a text without the
    requested marker comes back once rather than duplicated.
    """
    current = text.strip()
    carried: list[str] = []
    for depth, marker in enumerate(path):
        offset = 0
        if depth:
            # Below the top level ``current`` starts with its own marker; skip
            # it so the nested markers become the top level of the remainder.
            own = MARKER_RE.match(current)
            offset = own.end() if own else 0
        intro, sections = split_sections(current[offset:])
        if marker not in sections:
            break
        if intro:
            carried.append((current[:offset] + intro).strip())
        current = sections[marker]
    return "\n".join(part for part in [*carried, current] if part).strip()


def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map each part letter ``a``-``j`` to the ``"T"``/``"F"`` found after it.

    A part is recognised when a line starts with ``(letter)`` and is followed,
    on the same or the next line, by a standalone ``T`` or ``F``. Parts with no
    such answer are left out; if a letter appears twice the later one wins.
    """
    pattern = re.compile(r"(?m)^\(([a-j])\)\s*\n?([TF])\b")
    return {found.group(1): found.group(2) for found in pattern.finditer(answer_text)}


def derive_correct_answer(answer_text: str) -> str | None:
    """Pick a short one-line answer out of *answer_text*, if there is one.

    Only the text after the first ``"Answer:"`` is considered (the whole text
    when that label is absent). The first non-blank line, stripped, is the
    candidate. ``None`` is returned when the input is empty, when there is no
    non-blank line, when the candidate starts with "marking scheme" (any
    case), or when it is longer than 240 characters.
    """
    if not answer_text:
        return None
    _, label, after_label = answer_text.partition("Answer:")
    body = after_label if label else answer_text
    stripped_lines = map(str.strip, body.splitlines())
    candidate = next((line for line in stripped_lines if line), None)
    if candidate is None or candidate.lower().startswith("marking scheme"):
        return None
    return candidate if len(candidate) <= 240 else None


def load_seed_rows() -> dict[str, dict]:
    """Load this exam's rows from the seed file, keyed by question number.

    Reads ``PROBLEM_SEED_PATH``, which is expected to hold a JSON list of row
    objects, and keeps the rows whose ``source_exam_key`` equals ``EXAM_KEY``.

    Returns:
        Mapping of each kept row's ``question_number`` to the row itself.

    Raises:
        OSError: if the seed file cannot be read.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a row lacks ``source_exam_key`` or ``question_number``.
    """
    # Decode as UTF-8 explicitly: without it the platform's locale encoding is
    # used, which can garble or reject non-ASCII question text.
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }


def main() -> None:
    """Replace the exam's ``paper_questions`` rows with one row per ``CHILDREN`` entry.

    Steps:
      1. Look up the paper by ``EXAM_KEY`` and read its current question rows.
      2. Build every new row in memory: text and answer are cut from the
         parent seed row with ``extract_segment``; topic/skill/difficulty and
         hint fields prefer the existing row with the same question number.
      3. Delete all current rows of the paper, insert the new ones, and update
         the paper's ``question_count`` and ``status``.

    All lookups that can fail happen before step 3, so a missing paper or
    seed row aborts the run before anything is deleted.

    Raises:
        IndexError: if no paper has ``source_exam_key == EXAM_KEY``.
        KeyError: if the seed file lacks a parent question used by ``CHILDREN``.
    """
    sb = get_supabase()
    paper_rows = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not paper_rows:
        # Same exception type the bare ``[0]`` lookup raised, now with context.
        raise IndexError(f"No paper found with source_exam_key={EXAM_KEY!r}")
    paper_id = paper_rows[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}

    parent_rows = load_seed_rows()
    # Question "1" is needed for the true/false answer key below.
    needed_parents = {"1"} | {child.top_level_number for child in CHILDREN}
    missing_parents = sorted(needed_parents - set(parent_rows))
    if missing_parents:
        raise KeyError(f"Seed rows for {EXAM_KEY} are missing question(s): {missing_parents}")
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)

        correct_option = None
        correct_answer = None
        options = None
        if child.question_type == "true_false":
            # NOTE(review): this stores "T"/"F" while TRUE_FALSE_OPTIONS labels
            # are "True"/"False" -- confirm that consumers expect this form.
            correct_option = tf_answers.get(child.path[0])
            options = TRUE_FALSE_OPTIONS
        elif child.question_type == "fill_blank":
            correct_answer = derive_correct_answer(raw_answer_text)

        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": correct_answer,
                "raw_answer_text": raw_answer_text,
                # Precedence for the fields below: existing row, then the
                # child spec, then the parent seed row.
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # NOTE(review): delete and insert are separate requests; if the insert
    # fails the paper is left without question rows until the script is rerun.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()