Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
268
backend/split_comp2211_2023_spring_midterm.py
Normal file
268
backend/split_comp2211_2023_spring_midterm.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""Split COMP2211 Spring 2023 midterm into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Value of ``source_exam_key`` that selects this exam's rows.
EXAM_KEY = "COMP2211-2023-spring-midterm"

# Seed JSON for the course: two directories above this file, then
# pastpaper-scraper/reviews/COMP2211/problem_seed.json.
PROBLEM_SEED_PATH = Path(__file__).resolve().parents[1].joinpath(
    "pastpaper-scraper",
    "reviews",
    "COMP2211",
    "problem_seed.json",
)

# Option list stored on every true/false sub-question.
TRUE_FALSE_OPTIONS = [
    {"label": "True", "text": "True"},
    {"label": "False", "text": "False"},
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable, hand-written description of one sub-question of the exam.

    The first six fields are required. Everything after ``question_type`` is
    optional and defaults to ``None``, except ``page_number`` which defaults
    to ``1``. Field order is significant: specs are built positionally.
    """

    # --- identity and position in the paper -------------------------------
    question_number: str        # label of this sub-question, e.g. "2a_i"
    parent_question: str        # label of the enclosing question, e.g. "2a"
    top_level_number: str       # label of the top-level question, e.g. "2"
    path: tuple[str, ...]       # "(x)" markers leading to this part, outermost first
    score: float                # marks awarded for this part

    # --- classification ----------------------------------------------------
    question_type: str
    question_format: str | None = None
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None

    # --- answer data -------------------------------------------------------
    options: tuple[tuple[str, str], ...] | None = None   # (label, text) pairs
    correct_option: str | None = None
    correct_answer: str | None = None

    # --- location ----------------------------------------------------------
    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build the spec for a short-answer sub-question.

    The result always has ``question_type="long_question"`` and
    ``question_format="short_answer"``; every argument is copied onto the
    spec unchanged. ``options`` and ``correct_option`` keep their ``None``
    defaults. ``page_number`` is keyword-only and required.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        "short_answer",
        correct_answer=correct_answer,
        page_number=page_number,
        **tagging,
    )
|
||||
|
||||
|
||||
def mc(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    options: tuple[tuple[str, str], ...],
    correct_option: str,
    analytics_topic: str,
    skill_tags: tuple[str, ...],
    page_number: int,
) -> ChildSpec:
    """Build the spec for a multiple-choice sub-question.

    Both ``question_type`` and ``question_format`` are set to ``"mc"``.
    ``analytics_topic`` is reused as ``topic_primary`` and as the single
    entry of ``topic_tags``. ``correct_answer`` keeps its ``None`` default.
    """
    topic = analytics_topic
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "mc",
        "mc",
        topic,
        topic,
        (topic,),
        skill_tags,
        options=options,
        correct_option=correct_option,
        page_number=page_number,
    )
|
||||
|
||||
|
||||
# (label, text) pairs for a five-way multiple-choice question.
ABCDE = tuple((letter, letter) for letter in "ABCDE")

# Topic names used by the specs below.
_PROBABILISTIC = "Probabilistic Models"
_KNN = "KNN and Clustering"
_EVALUATION = "Evaluation and Validation"
_PERCEPTRON = "Perceptron and MLP"
_PYTHON = "Python Fundamentals"


def _true_false(letter: str, topic: str, skill: str) -> ChildSpec:
    """Spec for one 1-mark true/false part of question 1 (page 3)."""
    return ChildSpec(
        f"1{letter}",
        "1",
        "1",
        (letter,),
        1,
        "true_false",
        "true_false",
        topic,
        topic,
        (topic,),
        ("concept_check", skill),
        page_number=3,
    )


def _short(
    number: str,
    parent: str,
    top: str,
    path: tuple[str, ...],
    score: float,
    topic: str,
    skills: tuple[str, ...],
    page: int,
) -> ChildSpec:
    """Short-answer spec whose three topic fields all carry the same topic."""
    return short_answer(
        number,
        parent,
        top,
        path,
        score,
        analytics_topic=topic,
        topic_primary=topic,
        topic_tags=(topic,),
        skill_tags=skills,
        page_number=page,
    )


def _long(
    number: str,
    top: str,
    letter: str,
    score: float,
    question_format: str,
    topic: str,
    skills: tuple[str, ...],
    page: int,
) -> ChildSpec:
    """Directly-built ``long_question`` spec for a lettered part of ``top``."""
    return ChildSpec(
        number,
        top,
        top,
        (letter,),
        score,
        "long_question",
        question_format,
        topic,
        topic,
        (topic,),
        skills,
        page_number=page,
    )


# Every sub-question of the paper, in display order.
CHILDREN: list[ChildSpec] = [
    # Question 1: ten true/false statements.
    _true_false("a", _PROBABILISTIC, "classification_decision"),
    _true_false("b", _PROBABILISTIC, "classification_decision"),
    _true_false("c", _KNN, "algorithm_property"),
    _true_false("d", _KNN, "distance_reasoning"),
    _true_false("e", _EVALUATION, "validation_reasoning"),
    _true_false("f", _KNN, "algorithm_property"),
    _true_false("g", _KNN, "robustness_reasoning"),
    _true_false("h", _PERCEPTRON, "decision_boundary"),
    _true_false("i", _PERCEPTRON, "optimization_reasoning"),
    _true_false("j", _PERCEPTRON, "expressiveness_reasoning"),
    # Question 2(a): nine 1-mark parts.
    _short("2a_i", "2a", "2", ("a", "i"), 1, _PYTHON, ("code_tracing",), 4),
    _short("2a_ii", "2a", "2", ("a", "ii"), 1, _PYTHON, ("code_tracing",), 4),
    _short("2a_iii", "2a", "2", ("a", "iii"), 1, _PYTHON, ("code_tracing",), 4),
    _short("2a_iv", "2a", "2", ("a", "iv"), 1, _PYTHON, ("code_tracing",), 4),
    _short("2a_v", "2a", "2", ("a", "v"), 1, _PYTHON, ("indexing", "code_tracing"), 4),
    _short("2a_vi", "2a", "2", ("a", "vi"), 1, _PYTHON, ("indexing", "error_reasoning"), 5),
    _short("2a_vii", "2a", "2", ("a", "vii"), 1, _PYTHON, ("masking", "code_tracing"), 5),
    _short("2a_viii", "2a", "2", ("a", "viii"), 1, _PYTHON, ("aggregation", "code_tracing"), 5),
    _short("2a_ix", "2a", "2", ("a", "ix"), 1, _PYTHON, ("transpose", "code_tracing"), 5),
    # Question 2(b): three 2-mark parts.
    _short("2b_i", "2b", "2", ("b", "i"), 2, _PYTHON, ("broadcasting", "code_tracing"), 6),
    _short("2b_ii", "2b", "2", ("b", "ii"), 2, _PYTHON, ("broadcasting", "error_reasoning"), 6),
    _short("2b_iii", "2b", "2", ("b", "iii"), 2, _PYTHON, ("broadcasting", "code_tracing"), 6),
    # Question 2(c).
    _long("2c", "2", "c", 6, "coding", _PYTHON, ("implementation", "vectorization", "geometry_reasoning"), 7),
    # Question 3: no sub-parts, hence the empty path.
    _short("3", "3", "3", (), 8, _PROBABILISTIC, ("concept_explanation", "missing_data_reasoning"), 9),
    # Question 4.
    _long("4a", "4", "a", 8, "long_answer", _KNN, ("distance_calculation", "classification_decision"), 10),
    _short("4b", "4", "4", ("b",), 6, _KNN, ("distance_reasoning", "comparison"), 11),
    # Question 5.
    _long("5a", "5", "a", 7, "long_answer", _KNN, ("distance_calculation", "algorithm_tracing"), 12),
    _long("5b", "5", "b", 7, "long_answer", _KNN, ("centroid_update", "algorithm_tracing"), 12),
    _short("5c", "5", "5", ("c",), 5, _KNN, ("concept_explanation", "model_selection"), 14),
    # Question 6.
    _short("6a", "6", "6", ("a",), 2, _PERCEPTRON, ("convergence_reasoning",), 15),
    mc(
        "6b",
        "6",
        "6",
        ("b",),
        2,
        options=ABCDE,
        correct_option="D",
        analytics_topic=_PERCEPTRON,
        skill_tags=("generalization_reasoning",),
        page_number=15,
    ),
    _short("6c", "6", "6", ("c",), 2, _PERCEPTRON, ("activation_reasoning",), 16),
    _long("6d", "6", "d", 6, "coding", _PERCEPTRON, ("debugging", "implementation", "weight_update"), 16),
    # Question 7.
    _short("7a", "7", "7", ("a",), 4, _PERCEPTRON, ("decision_boundary", "linearity_reasoning"), 18),
    _short("7b", "7", "7", ("b",), 2, _PERCEPTRON, ("decision_boundary", "linearity_reasoning"), 18),
    _long("7c", "7", "c", 10, "long_answer", _PERCEPTRON, ("architecture_reasoning", "parameter_design"), 19),
]
|
||||
|
||||
|
||||
# A part label such as "(a)" or "(iv)" at the start of a line.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")

# Roman-numeral labels, in order, used to recognise "(i) -> (ii) -> ..." runs.
_ROMAN_SEQUENCE = (
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
)
# Labels that can open a new list nested inside the current part.
_SEQUENCE_STARTS = frozenset({"a", "i"})


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split ``text`` flatly at every part marker.

    Returns ``(intro, sections)``. ``intro`` is the stripped text before the
    first marker. ``sections`` maps each marker label to the stripped text
    from that marker up to the next marker of any kind (or the end of the
    text). If a label occurs more than once, the last occurrence wins. With
    no markers at all the result is ``(text.strip(), {})``.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    starts = [match.start() for match in matches]
    ends = [*starts[1:], len(text)]
    sections = {
        match.group(1): text[start:end].strip()
        for match, start, end in zip(matches, starts, ends)
    }
    return text[: starts[0]].strip(), sections


def _followers(marker: str) -> set[str]:
    """Return the labels that may directly follow ``marker`` as its next sibling.

    A label like "i" is ambiguous (letter or roman numeral), so both the
    alphabetical and the roman successor are returned when they exist.
    """
    followers: set[str] = set()
    if len(marker) == 1 and "a" <= marker < "z":
        followers.add(chr(ord(marker) + 1))
    if marker in _ROMAN_SEQUENCE[:-1]:
        followers.add(_ROMAN_SEQUENCE[_ROMAN_SEQUENCE.index(marker) + 1])
    return followers


def _marker_levels(markers: list[str]) -> list[int]:
    """Assign a nesting level (0 = outermost) to each marker, in text order.

    A marker that continues the sequence of an open level belongs to that
    level and closes everything nested deeper. Otherwise "a" or "i" opens a
    new nested level, and any other label is treated as a sibling at the
    innermost open level.
    """
    open_levels: list[str] = []  # last label seen at each currently open level
    levels: list[int] = []
    for marker in markers:
        for level in range(len(open_levels) - 1, -1, -1):
            if marker in _followers(open_levels[level]):
                del open_levels[level + 1 :]
                open_levels[level] = marker
                break
        else:
            if not open_levels or marker in _SEQUENCE_STARTS:
                open_levels.append(marker)
            else:
                open_levels[-1] = marker
        levels.append(len(open_levels) - 1)
    return levels


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the text of the sub-part of ``text`` addressed by ``path``.

    ``path`` lists marker labels from the outermost part inwards, e.g.
    ``("a", "ii")`` for part (a)(ii). The result is the text before the
    first marker (the shared stem), a newline, and the addressed part. The
    part runs from its own marker up to the next marker at the same or an
    outer nesting level, so a part keeps the sub-parts nested inside it.

    An empty ``path`` returns ``text.strip()``. A label that cannot be found
    is skipped. If no label of ``path`` is found, the whole stripped text is
    returned once, without the stem prepended to text that already holds it.
    """
    body = text.strip()
    if not path:
        return body

    matches = list(MARKER_RE.finditer(body))
    levels = _marker_levels([match.group(1) for match in matches])

    lo, hi = 0, len(matches)  # candidate matches inside the current part
    start, end = 0, len(body)  # character span of the current part
    parent_level = -1
    found = False
    for marker in path:
        wanted = parent_level + 1
        hit = next(
            (k for k in range(lo, hi) if levels[k] == wanted and matches[k].group(1) == marker),
            None,
        )
        if hit is None:
            continue
        # The part ends where the next sibling (or an outer part) begins.
        stop = next((k for k in range(hit + 1, hi) if levels[k] <= wanted), hi)
        start = matches[hit].start()
        if stop < len(matches):
            end = matches[stop].start()
        lo, hi = hit + 1, stop
        parent_level = wanted
        found = True

    if not found:
        return body
    intro = body[: matches[0].start()].strip()
    segment = body[start:end].strip()
    return "\n".join(part for part in (intro, segment) if part).strip()
|
||||
|
||||
|
||||
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map each part letter ``a``-``j`` to ``"T"`` or ``"F"`` read from ``answer_text``.

    Three strategies are tried in order:

    1. If any part label is followed by both letters ("T F"), an empty dict
       is returned (presumably the text is an unmarked template from which
       the chosen answer cannot be told; confirm against the seed data).
    2. A label at the start of a line followed by a lone ``T`` or ``F``.
    3. A line that is only a label, answered by the next line that is
       exactly ``T`` or ``F`` (surrounding whitespace and blank lines ignored).
    """
    if re.search(r"(?m)^\(([a-j])\)\s*\n?T\s*F", answer_text):
        return {}

    inline = {
        hit.group(1): hit.group(2)
        for hit in re.finditer(r"(?m)^\(([a-j])\)\s*\n?([TF])\b", answer_text)
    }
    if inline:
        return inline

    by_line: dict[str, str] = {}
    pending = None  # label still waiting for its T/F line
    for raw_line in answer_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        label = re.fullmatch(r"\(([a-j])\)", line)
        if label is not None:
            pending = label.group(1)
        elif pending and line in {"T", "F"}:
            by_line[pending] = line
            pending = None
    return by_line
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load this exam's rows from the seed file, keyed by ``question_number``.

    Reads ``PROBLEM_SEED_PATH`` as JSON and keeps only the rows whose
    ``source_exam_key`` equals ``EXAM_KEY``.

    Raises:
        OSError: if the seed file cannot be read.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a row lacks ``source_exam_key`` or ``question_number``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {row["question_number"]: row for row in data if row["source_exam_key"] == EXAM_KEY}
|
||||
|
||||
|
||||
def _build_question_row(
    child: ChildSpec,
    display_order: int,
    paper_id,
    parent: dict,
    existing: dict,
    tf_answers: dict[str, str],
) -> dict:
    """Assemble the ``paper_questions`` row for one sub-question.

    ``parent`` is the seed row of the child's top-level question and
    ``existing`` is the row currently stored under the same question number
    (``{}`` if none). Existing topic/tag/difficulty values take precedence
    over the spec, which takes precedence over the parent seed row.
    """
    question_text = extract_segment(parent["question_text"] or "", child.path)
    whole_answer = parent["raw_answer_text"] or ""
    # A question without sub-parts keeps the parent's answer text verbatim.
    raw_answer_text = extract_segment(whole_answer, child.path) if child.path else whole_answer

    options = None
    correct_option = child.correct_option
    if child.options:
        options = [{"label": label, "text": text} for label, text in child.options]
    if child.question_type == "true_false":
        options = TRUE_FALSE_OPTIONS
        # NOTE(review): this stores "T"/"F" (or None when no answer was
        # parsed) while the option labels are "True"/"False" - confirm that
        # consumers of correct_option expect the short form.
        correct_option = tf_answers.get(child.path[0])

    spec_topic_tags = list(child.topic_tags) if child.topic_tags else None
    spec_skill_tags = list(child.skill_tags) if child.skill_tags else None
    return {
        "paper_id": paper_id,
        "question_number": child.question_number,
        "parent_question": child.parent_question,
        "display_order": display_order,
        "question_type": child.question_type,
        "question_format": child.question_format,
        "question_text": question_text,
        "score": child.score,
        "page_number": child.page_number,
        "page_y_ratio": existing.get("page_y_ratio"),
        "options": options,
        "correct_option": correct_option,
        "correct_answer": child.correct_answer,
        "raw_answer_text": raw_answer_text,
        "topics": existing.get("topics") or (spec_topic_tags or parent.get("topics")),
        "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
        "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
        "topic_tags": existing.get("topic_tags") or (spec_topic_tags or parent.get("topic_tags")),
        "skill_tags": existing.get("skill_tags") or (spec_skill_tags or parent.get("skill_tags")),
        "difficulty": existing.get("difficulty") or parent.get("difficulty"),
        "knowledge_reminder": existing.get("knowledge_reminder", ""),
        "ai_hint": existing.get("ai_hint", ""),
        "solution": existing.get("solution", ""),
    }


def main() -> None:
    """Replace the exam's ``paper_questions`` rows with one row per ``CHILDREN`` spec.

    Looks up the paper by ``EXAM_KEY``, builds every new row from the seed
    file and the rows currently stored, then deletes the paper's rows,
    inserts the new ones and updates ``question_count`` and ``status`` on
    the paper.

    Raises:
        LookupError: if no paper has ``source_exam_key == EXAM_KEY``.
        KeyError: if the seed file lacks a top-level question that is needed.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        raise LookupError(f"No paper found with source_exam_key={EXAM_KEY!r}")
    paper_id = papers[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}

    parent_rows = load_seed_rows()
    # Question "1" is always needed: it holds the true/false answer key.
    needed = {"1"} | {child.top_level_number for child in CHILDREN}
    missing = sorted(needed - parent_rows.keys())
    if missing:
        raise KeyError(f"Seed file has no rows for {EXAM_KEY} question(s): {', '.join(missing)}")
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = [
        _build_question_row(
            child,
            display_order,
            paper_id,
            parent_rows[child.top_level_number],
            existing_by_number.get(child.question_number, {}),
            tf_answers,
        )
        for display_order, child in enumerate(CHILDREN, start=1)
    ]

    # NOTE: the delete and the insert are separate requests. If the insert
    # fails, the paper is left without questions until the script is re-run.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user