Files
PastpaperMaster/backend/split_comp2211_2024_spring_midterm.py
Zhao 7a09167261 Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:27:47 +07:00

292 lines
18 KiB
Python

"""Rebuild COMP2211 Spring 2024 midterm into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
import fitz
from app.services.supabase_client import get_supabase
# Identifier used both to look up the paper row and to filter the seed file.
EXAM_KEY = "COMP2211-2024-spring-midterm"

# Directory two levels above this script.
ROOT = Path(__file__).resolve().parent.parent

# Shared path fragments for the scraper output tree.
_SCRAPER_DIR = ROOT / "pastpaper-scraper"
_COURSE_CODE = "COMP2211"
_PAPERS_DIR = _SCRAPER_DIR / "papers" / _COURSE_CODE

QUESTION_PDF = _PAPERS_DIR / "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf"
ANSWER_PDF = _PAPERS_DIR / "(COMP2211)[2024](s)midterm~=ubrzkjmz^_90406.pdf"
PROBLEM_SEED_PATH = _SCRAPER_DIR / "reviews" / _COURSE_CODE / "problem_seed.json"

# Option list attached to every true/false sub-question.
TRUE_FALSE_OPTIONS = [{"label": word, "text": word} for word in ("True", "False")]
@dataclass(frozen=True)
class ChildSpec:
    """Immutable description of one sub-question row to write to ``paper_questions``."""

    # Label stored as the row's question_number, e.g. "1a" or "2a_i".
    question_number: str
    # Value stored in the row's parent_question column.
    parent_question: str
    # Key of the top-level source row whose text this child is cut from.
    top_level_number: str
    # Part markers, outermost first, handed to extract_segment; () keeps the whole text.
    path: tuple[str, ...]
    # Marks for this sub-question.
    score: float
    # Row's question_type; "true_false" additionally gets options and a correct option.
    question_type: str
    # Row's question_format (values used in this file: "true_false", "short_answer",
    # "long_answer", "coding").
    question_format: str | None = None
    # Topic/skill metadata; used when the existing row has no truthy value for the field.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # Value stored in the row's page_number column.
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a short-answer part.

    Every argument is forwarded unchanged; only the question type
    ("long_question") and format ("short_answer") are fixed here.
    """
    fixed_type = "long_question"
    fixed_format = "short_answer"
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        fixed_type,
        fixed_format,
        analytics_topic,
        topic_primary,
        topic_tags,
        skill_tags,
        page_number=page_number,
    )
# Every sub-question of the exam, in display order.  main() enumerates this
# list from 1 to assign display_order, so the order here is significant.
CHILDREN: list[ChildSpec] = [
    # Question 1: ten true/false parts (a)-(j), 0.5 marks each.
    ChildSpec("1a", "1", "1", ("a",), 0.5, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "code_tracing"), page_number=3),
    ChildSpec("1b", "1", "1", ("b",), 0.5, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "broadcasting"), page_number=3),
    ChildSpec("1c", "1", "1", ("c",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
    ChildSpec("1d", "1", "1", ("d",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "tie_reasoning"), page_number=3),
    ChildSpec("1e", "1", "1", ("e",), 0.5, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "cross_validation"), page_number=3),
    ChildSpec("1f", "1", "1", ("f",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "clustering_property"), page_number=3),
    ChildSpec("1g", "1", "1", ("g",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "robustness_reasoning"), page_number=3),
    ChildSpec("1h", "1", "1", ("h",), 0.5, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "decision_boundary"), page_number=3),
    ChildSpec("1i", "1", "1", ("i",), 0.5, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "optimization_reasoning"), page_number=3),
    ChildSpec("1j", "1", "1", ("j",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "clustering_property"), page_number=3),
    # Question 2(a): thirteen one-mark short answers (i)-(xiii), nested under part (a).
    short_answer("2a_i", "2a", "2", ("a", "i"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_ii", "2a", "2", ("a", "ii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_iii", "2a", "2", ("a", "iii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("array_manipulation",), page_number=5),
    short_answer("2a_iv", "2a", "2", ("a", "iv"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("array_construction",), page_number=5),
    short_answer("2a_v", "2a", "2", ("a", "v"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("aggregation",), page_number=5),
    short_answer("2a_vi", "2a", "2", ("a", "vi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("transpose",), page_number=6),
    short_answer("2a_vii", "2a", "2", ("a", "vii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("matrix_multiplication",), page_number=6),
    short_answer("2a_viii", "2a", "2", ("a", "viii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("dot_product",), page_number=6),
    short_answer("2a_ix", "2a", "2", ("a", "ix"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting",), page_number=6),
    short_answer("2a_x", "2a", "2", ("a", "x"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("error_reasoning",), page_number=7),
    short_answer("2a_xi", "2a", "2", ("a", "xi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting",), page_number=7),
    short_answer("2a_xii", "2a", "2", ("a", "xii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("slicing",), page_number=7),
    short_answer("2a_xiii", "2a", "2", ("a", "xiii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("views_vs_copies",), page_number=7),
    # Question 2(b): coding part.
    ChildSpec("2b", "2", "2", ("b",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "similarity_computation"), page_number=8),
    # Question 3: parts (a)-(e).
    ChildSpec("3a", "3", "3", ("a",), 5.5, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("manual_computation", "metric_reasoning"), page_number=10),
    short_answer("3b", "3", "3", ("b",), 1, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("metric_reasoning",), page_number=11),
    ChildSpec("3c", "3", "3", ("c",), 2.5, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("manual_computation", "metric_reasoning"), page_number=11),
    short_answer("3d", "3", "3", ("d",), 1, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("metric_reasoning",), page_number=12),
    ChildSpec("3e", "3", "3", ("e",), 6, "long_question", "coding", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("implementation", "metrics", "vectorization"), page_number=12),
    # Question 4: parts (a)-(e).
    ChildSpec("4a", "4", "4", ("a",), 4, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "gaussian_nb"), page_number=15),
    ChildSpec("4b", "4", "4", ("b",), 3, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "likelihood_reasoning"), page_number=15),
    ChildSpec("4c", "4", "4", ("c",), 4, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("laplace_smoothing", "likelihood_reasoning"), page_number=16),
    short_answer("4d", "4", "4", ("d",), 2, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("prior_reasoning",), page_number=17),
    ChildSpec("4e", "4", "4", ("e",), 3, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("posterior_reasoning", "classification_decision"), page_number=17),
    # Question 5: parts (a)-(c).
    ChildSpec("5a", "5", "5", ("a",), 3, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "weighted_knn"), page_number=18),
    ChildSpec("5b", "5", "5", ("b",), 13, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("cross_validation", "manual_tracing", "model_selection"), page_number=18),
    short_answer("5c", "5", "5", ("c",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("test_error", "model_selection"), page_number=20),
    # Question 6: parts (a)-(d).
    ChildSpec("6a", "6", "6", ("a",), 6, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("manual_computation", "clustering"), page_number=21),
    ChildSpec("6b", "6", "6", ("b",), 6, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("manual_computation", "clustering"), page_number=22),
    short_answer("6c", "6", "6", ("c",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("outlier_reasoning",), page_number=22),
    short_answer("6d", "6", "6", ("d",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("model_selection", "threshold_reasoning"), page_number=22),
    # Question 7: kept whole (empty path), so the full parent text is used.
    ChildSpec("7", "7", "7", (), 10, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("cross_validation", "data_leakage_reasoning"), page_number=23),
]
# Part labels such as "(a)" or "(iv)" at the start of a line.  Roman numerals
# are themselves runs of lowercase letters, so one ``[a-z]+`` class covers both
# lettered parts and Roman-numbered sub-parts.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")

# Single letters that can also be read as a Roman numeral.
_ROMAN_LOOKALIKES = frozenset("ivx")


def _to_roman(number: int) -> str:
    """Return *number* as a lowercase Roman numeral (valid for 1..39)."""
    pieces = []
    for value, symbol in ((10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")):
        count, number = divmod(number, value)
        pieces.append(symbol * count)
    return "".join(pieces)


# "i", "ii", ..., "xxxix" in ascending order.
_ROMAN_LABELS = tuple(_to_roman(n) for n in range(1, 40))


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* at every part marker, ignoring nesting.

    Returns ``(intro, sections)``: ``intro`` is the stripped text before the
    first marker (the whole stripped text when there is none) and ``sections``
    maps each marker label to the stripped text from that marker up to the
    next marker of any kind.  When a label occurs more than once, the last
    occurrence wins.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    intro = text[: matches[0].start()].strip()
    sections: dict[str, str] = {}
    for idx, match in enumerate(matches):
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        sections[match.group(1)] = text[match.start() : end].strip()
    return intro, sections


def _later_sibling_labels(marker: str) -> set[str]:
    """Return the labels that can close the section opened by *marker*.

    A lettered part is closed by any later letter, except that "i", "v" and
    "x" only count when they are the immediate successor; otherwise they are
    taken to be nested Roman sub-parts.  A Roman label is closed by any larger
    Roman numeral.  A label that is both (e.g. "i") gets the union.
    """
    later: set[str] = set()
    if len(marker) == 1 and "a" <= marker <= "z":
        following = [chr(code) for code in range(ord(marker) + 1, ord("z") + 1)]
        later.update(
            letter
            for position, letter in enumerate(following)
            if position == 0 or letter not in _ROMAN_LOOKALIKES
        )
    if marker in _ROMAN_LABELS:
        later.update(_ROMAN_LABELS[_ROMAN_LABELS.index(marker) + 1 :])
    return later


def _locate_section(text: str, marker: str, *, skip_leading: bool) -> tuple[int, int] | None:
    """Return the ``(start, end)`` span of *marker*'s section, or None if absent.

    The section starts at the first occurrence of the marker and runs to the
    next later-sibling label, or to the end of *text*.  With *skip_leading*, a
    marker at offset 0 is ignored because it is the enclosing part's own label.
    """
    later = _later_sibling_labels(marker)
    start: int | None = None
    for match in MARKER_RE.finditer(text):
        if skip_leading and match.start() == 0:
            continue
        label = match.group(1)
        if start is None:
            if label == marker:
                start = match.start()
        elif label in later:
            return start, match.start()
    return None if start is None else (start, len(text))


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, prefixed by the question intro.

    *path* lists part markers from the outermost inwards, e.g. ``("a", "iv")``
    for part (a)(iv).  Each level is looked up inside the section found for
    the previous level, so nested sub-parts resolve to their own text instead
    of the enclosing part's lead-in.  A marker that cannot be found leaves the
    current text unchanged for that level.

    The text before the first marker is prepended once, and only when at
    least one marker was resolved; otherwise the stripped text is returned
    as-is so the intro is never duplicated.  An empty *path* returns the
    stripped text.
    """
    current = text.strip()
    if not path:
        return current
    first_marker = MARKER_RE.search(current)
    intro = current[: first_marker.start()].strip() if first_marker else ""
    narrowed = False
    for marker in path:
        span = _locate_section(current, marker, skip_leading=narrowed)
        if span is None:
            continue
        current = current[span[0] : span[1]].strip()
        narrowed = True
    if narrowed and intro:
        return f"{intro}\n{current}"
    return current
def extract_pages(pdf_path: Path, start: int, end: int) -> str:
    """Return the plain text of pages *start*..*end* of a PDF, joined by newlines.

    Page numbers are 1-based and the range is inclusive at both ends.  The
    document is closed even if text extraction raises.
    """
    document = fitz.open(pdf_path)
    try:
        page_texts = []
        for page_index in range(start - 1, end):
            page_texts.append(document[page_index].get_text("text"))
        return "\n".join(page_texts)
    finally:
        document.close()
def load_seed_rows(seed_path: Path | None = None, exam_key: str | None = None) -> dict[str, dict]:
    """Load the seed rows for one exam, keyed by ``question_number``.

    Args:
        seed_path: JSON file holding a list of row dicts; defaults to
            ``PROBLEM_SEED_PATH``.
        exam_key: Only rows whose ``source_exam_key`` equals this value are
            kept; defaults to ``EXAM_KEY``.

    Returns:
        Mapping of question number to its seed row.  Rows without a
        ``source_exam_key`` are skipped.

    Raises:
        KeyError: If a matching row has no ``question_number``.
    """
    path = PROBLEM_SEED_PATH if seed_path is None else seed_path
    wanted_key = EXAM_KEY if exam_key is None else exam_key
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    data = json.loads(path.read_text(encoding="utf-8"))
    return {row["question_number"]: row for row in data if row.get("source_exam_key") == wanted_key}
def _rebuild_parent(
    number: str,
    seed_rows: dict[str, dict],
    existing_rows: dict[str, dict],
    *,
    question_pages: tuple[int, int],
    answer_pages: tuple[int, int],
    metadata: dict,
) -> dict:
    """Return the parent row for *number* with its text re-read from the PDFs.

    The seed row is preferred and additionally receives *metadata*.  When the
    seed has no such row, the existing database row is used as the base and
    only the text and page number are refreshed.

    Raises:
        KeyError: If neither the seed nor the existing rows contain *number*.
    """
    if number in seed_rows:
        base, extra = seed_rows[number], metadata
    elif number in existing_rows:
        base, extra = existing_rows[number], {}
    else:
        raise KeyError(f"No seed or existing row for question {number!r} of {EXAM_KEY}")
    return {
        **base,
        "question_text": extract_pages(QUESTION_PDF, *question_pages),
        "raw_answer_text": extract_pages(ANSWER_PDF, *answer_pages),
        "page_number": question_pages[0],
        **extra,
    }


def build_source_rows(existing_rows: dict[str, dict]) -> dict[str, dict]:
    """Return the top-level source rows, keyed by question number.

    Starts from the seed rows for this exam and replaces questions 5 and 7
    with versions whose question and answer text are re-extracted from fixed
    page ranges of the PDFs.

    Args:
        existing_rows: Rows currently stored for the paper, keyed by question
            number; used as the base for question 5 or 7 when the seed lacks it.

    Raises:
        KeyError: If question 5 or 7 is in neither the seed nor *existing_rows*.
    """
    seed_rows = load_seed_rows()
    rows = dict(seed_rows)
    rows["5"] = _rebuild_parent(
        "5",
        seed_rows,
        existing_rows,
        question_pages=(18, 20),
        answer_pages=(21, 25),
        metadata={
            "analytics_topic": "KNN and Clustering",
            "topic_primary": "KNN and Clustering",
            "topic_tags": ["KNN and Clustering"],
            "skill_tags": ["manual_computation", "distance_calculation", "algorithm_tracing"],
            "difficulty": "medium",
        },
    )
    rows["7"] = _rebuild_parent(
        "7",
        seed_rows,
        existing_rows,
        question_pages=(23, 24),
        answer_pages=(31, 34),
        metadata={
            "analytics_topic": "Evaluation and Validation",
            "topic_primary": "Evaluation and Validation",
            "topic_tags": ["Evaluation and Validation"],
            "skill_tags": ["cross_validation", "data_leakage_reasoning"],
            "difficulty": "medium",
        },
    )
    return rows
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map part letters "a".."j" to "T"/"F" parsed from *answer_text*.

    Three strategies are tried in order and the first that yields a result wins:

    1. A table: the word "Answer" followed by a run of T/F characters with at
       least ten entries; the first ten are assigned to a..j.
    2. Label/value line pairs: a line that is exactly "(x)" for x in a..j,
       followed later by a line that is exactly "T" or "F".  Partial results
       are returned as-is.
    3. The first ten standalone "T"/"F" tokens anywhere in the text.

    Returns an empty dict when none of them applies.
    """
    letters = "abcdefghij"

    table = re.search(r"Answer\s+([TF\s]+)", answer_text, re.S)
    if table is not None:
        marks = re.findall(r"[TF]", table.group(1))
        if len(marks) >= len(letters):
            # zip stops after the tenth letter, discarding surplus marks.
            return dict(zip(letters, marks))

    paired: dict[str, str] = {}
    pending: str | None = None
    for raw_line in answer_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        label = re.fullmatch(r"\(([a-j])\)", line)
        if label is not None:
            pending = label.group(1)
        elif pending and line in ("T", "F"):
            paired[pending] = line
            pending = None
    if paired:
        return paired

    tokens = re.findall(r"\b([TF])\b", answer_text)
    if len(tokens) < len(letters):
        return {}
    return dict(zip(letters, tokens))
def _build_insert_row(
    paper_id,
    display_order: int,
    child: ChildSpec,
    parent: dict,
    existing: dict,
    tf_answers: dict[str, str],
) -> dict:
    """Assemble the ``paper_questions`` row for one child spec.

    Text is cut out of *parent* using the child's path.  Topic, skill and
    difficulty fields prefer a truthy value from *existing* (the row currently
    stored under the same question number), then the child spec, then the
    parent row.
    """
    question_text = extract_segment(parent["question_text"] or "", child.path)
    if child.path:
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)
    else:
        # Whole-question children keep the parent's answer text verbatim.
        raw_answer_text = parent["raw_answer_text"] or ""

    options = None
    correct_option = None
    if child.question_type == "true_false":
        options = TRUE_FALSE_OPTIONS
        # NOTE(review): parsed answers are "T"/"F" while the option labels are
        # "True"/"False" -- confirm which form consumers of correct_option expect.
        correct_option = tf_answers.get(child.path[0])

    child_topic_tags = list(child.topic_tags) if child.topic_tags else None
    child_skill_tags = list(child.skill_tags) if child.skill_tags else None
    return {
        "paper_id": paper_id,
        "question_number": child.question_number,
        "parent_question": child.parent_question,
        "display_order": display_order,
        "question_type": child.question_type,
        "question_format": child.question_format,
        "question_text": question_text,
        "score": child.score,
        "page_number": child.page_number,
        "page_y_ratio": existing.get("page_y_ratio"),
        "options": options,
        "correct_option": correct_option,
        "correct_answer": None,
        "raw_answer_text": raw_answer_text,
        "topics": existing.get("topics") or (child_topic_tags if child_topic_tags else parent.get("topics")),
        "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
        "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
        "topic_tags": existing.get("topic_tags") or (child_topic_tags if child_topic_tags else parent.get("topic_tags")),
        "skill_tags": existing.get("skill_tags") or (child_skill_tags if child_skill_tags else parent.get("skill_tags")),
        "difficulty": existing.get("difficulty") or parent.get("difficulty"),
        "knowledge_reminder": existing.get("knowledge_reminder", ""),
        "ai_hint": existing.get("ai_hint", ""),
        "solution": existing.get("solution", ""),
    }


def main() -> None:
    """Replace the paper's question rows with the sub-questions in ``CHILDREN``.

    Looks up the paper by ``EXAM_KEY``, builds one row per child spec, then
    deletes every existing ``paper_questions`` row of the paper, inserts the
    new rows, and updates the paper's ``question_count`` and ``status``.

    Raises:
        LookupError: If no paper matches ``EXAM_KEY``.
        KeyError: If a top-level question referenced by ``CHILDREN`` has no
            source row.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        raise LookupError(f"No paper found with source_exam_key={EXAM_KEY!r}.")
    paper_id = papers[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = build_source_rows(existing_by_number)

    # Fail with the full list of missing parents before anything is modified.
    missing_parents = sorted({child.top_level_number for child in CHILDREN}.difference(parent_rows))
    if missing_parents:
        raise KeyError(f"Missing source rows for top-level questions: {', '.join(missing_parents)}")

    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")
    inserts = [
        _build_insert_row(
            paper_id,
            display_order,
            child,
            parent_rows[child.top_level_number],
            existing_by_number.get(child.question_number, {}),
            tf_answers,
        )
        for display_order, child in enumerate(CHILDREN, start=1)
    ]

    # NOTE(review): delete and insert are two separate requests, so a failed
    # insert leaves the paper without questions; every row is built beforehand
    # to keep that window as small as possible.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()