Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Zhao
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions

View File

@@ -0,0 +1,160 @@
"""Backfill page_y_ratio for COMP2211 subquestions."""
from __future__ import annotations
import re
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import fitz
import httpx
from app.services.supabase_client import get_supabase
# Directory two levels above this file.
ROOT = Path(__file__).resolve().parent.parent

# Folder holding the COMP2211 past-paper PDFs.
PAPERS_DIR = ROOT.joinpath("pastpaper-scraper", "papers", "COMP2211")

# Maps a paper's ``source_exam_key`` to the file name of its PDF inside
# PAPERS_DIR. Papers whose key is not listed here are skipped by main().
PDF_BY_EXAM_KEY = {
    # 2022
    "COMP2211-2022-fall-midterm": "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
    "COMP2211-2022-spring-midterm": "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
    "COMP2211-2022-spring-final-part-a": "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
    "COMP2211-2022-spring-final-part-b": "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
    # 2023
    "COMP2211-2023-spring-midterm": "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
    # 2024
    "COMP2211-2024-spring-midterm": "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
    "COMP2211-2024-spring-final": "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
}
def marker_candidates(question_number: str) -> list[str]:
    """Return the text markers to look for on a page, in search order.

    Forms recognised, as matched by the patterns below:

    * ``"<digits><letters>_<suffix>"`` such as ``"1a_i"`` gives
      ``["(i)", "(a)"]``; if the part before the underscore is not
      digits followed by lowercase letters, only ``"(<suffix>)"`` is returned.
    * ``"<digits><letter>"`` such as ``"1a"`` gives ``["(a)", "Problem 1"]``.
    * ``"<digits>"`` such as ``"3"`` gives ``["Problem 3"]``.
    * Anything else is returned unchanged as the only candidate.

    Args:
        question_number: Question identifier as stored with the question.

    Returns:
        A non-empty list of marker strings, first candidate first.
    """
    if "_" in question_number:
        left, right = question_number.split("_", 1)
        tokens: list[str] = []
        # A single capturing pattern handles one-letter and multi-letter
        # parts alike. Capturing the letters directly avoids stripping the
        # digits with a regex embedded in an f-string expression, which is a
        # syntax error before Python 3.12 and is easy to get wrong (a doubled
        # backslash in a raw string matches a literal backslash, not digits).
        m = re.fullmatch(r"\d+([a-z]+)", left)
        if m:
            tokens.append(f"({m.group(1)})")
        tokens.append(f"({right})")
        # The suffix after the underscore is tried first.
        return tokens[::-1]

    m = re.fullmatch(r"(\d+)([a-z])", question_number)
    if m:
        return [f"({m.group(2)})", f"Problem {m.group(1)}"]
    if question_number.isdigit():
        return [f"Problem {question_number}"]
    return [question_number]
def line_matches(line_text: str, marker: str) -> bool:
    """Return whether a line of page text carries ``marker``.

    Runs of whitespace in ``line_text`` are collapsed to single spaces and
    the ends are trimmed before comparing; a blank line never matches.

    * A marker starting with ``"("`` (e.g. ``"(a)"``) must open the line and
      is compared case-sensitively.
    * Any other marker may appear anywhere in the line and is compared
      case-insensitively. When such a marker ends in a digit it must not be
      followed directly by another digit, so ``"Problem 1"`` does not match
      a ``"Problem 10"`` heading.

    Args:
        line_text: Raw text of one line.
        marker: Marker string to look for.

    Returns:
        True if the line matches the marker under the rules above.
    """
    text = re.sub(r"\s+", " ", line_text.strip())
    if not text:
        return False
    if marker.startswith("("):
        return text.startswith(marker)
    # Slicing keeps an empty marker from raising IndexError.
    if marker[-1:].isdigit():
        # Reject hits that are only the prefix of a longer number.
        pattern = re.escape(marker) + r"(?!\d)"
        return re.search(pattern, text, re.IGNORECASE) is not None
    return marker.lower() in text.lower()
def line_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` line by line and return its vertical position.

    Walks the ``"dict"`` text layout of ``page``, rebuilds each line's text
    from its spans and tests it with ``line_matches``. Only blocks whose
    ``type`` is 0 are inspected.

    Args:
        page: Page to scan.
        marker: Marker string to look for.

    Returns:
        The top edge of the highest matching line as a fraction of the page
        height, clamped to the range 0.0-0.98, or None when no line matches.
    """
    layout = page.get_text("dict")
    tops: list[float] = []
    for block in layout.get("blocks", []):
        if block.get("type") == 0:
            for line in block.get("lines", []):
                pieces = [span.get("text", "") for span in line.get("spans", [])]
                if not line_matches("".join(pieces), marker):
                    continue
                box = line.get("bbox")
                if box:
                    # Second bbox entry is the line's upper y coordinate.
                    tops.append(float(box[1]))

    if not tops:
        return None

    # Use the match nearest the top of the page.
    fraction = (min(tops) - page.rect.y0) / page.rect.height
    return max(0.0, min(fraction, 0.98))
def search_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` with the page's own text search.

    Args:
        page: Page to search.
        marker: Text passed to ``page.search_for``.

    Returns:
        The smallest top-edge position among all hits, expressed as a
        fraction of the page height and clamped to 0.0-0.98, or None when
        the search returns no hits.
    """
    return min(
        (
            max(0.0, min((hit.y0 - page.rect.y0) / page.rect.height, 0.98))
            for hit in page.search_for(marker)
        ),
        default=None,
    )
def infer_y_ratio(page: fitz.Page, question_number: str) -> float:
    """Estimate where a question starts on ``page``.

    Each marker from ``marker_candidates`` is tried in order, first with the
    line-based lookup and then with the page text search; the first position
    found is returned.

    Args:
        page: Page the question is expected on.
        question_number: Question identifier used to derive the markers.

    Returns:
        The vertical position as a fraction of the page height, or 0.05 when
        no marker can be located.
    """
    locators = (line_y_ratio, search_y_ratio)
    for marker in marker_candidates(question_number):
        for locate in locators:
            found = locate(page, marker)
            if found is not None:
                return found
    # Nothing located: fall back to a fixed position.
    return 0.05
def _collect_updates(sb) -> list[tuple[str, float]]:
    """Compute ``(question id, page_y_ratio)`` pairs for the mapped papers.

    Reads the COMP2211 ``course_library`` rows from ``papers``, opens the PDF
    listed for each paper in ``PDF_BY_EXAM_KEY`` and infers a ratio for every
    row of ``paper_questions`` belonging to it. Papers without a mapped PDF
    are skipped.

    Args:
        sb: Client returned by ``get_supabase()``.

    Returns:
        One ``(id, ratio)`` tuple per processed question, ratio rounded to
        four decimals.
    """
    papers = (
        sb.table("papers")
        .select("id, source_exam_key")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
        or []
    )
    updates: list[tuple[str, float]] = []
    for paper in papers:
        pdf_name = PDF_BY_EXAM_KEY.get(paper["source_exam_key"])
        if not pdf_name:
            continue
        doc = fitz.open(PAPERS_DIR / pdf_name)
        try:
            questions = (
                sb.table("paper_questions")
                .select("id, question_number, page_number")
                .eq("paper_id", paper["id"])
                .order("display_order")
                .execute()
                .data
                or []
            )
            page_count = len(doc)
            for question in questions:
                # A missing or zero page number is treated as page 1.
                page_number = question.get("page_number") or 1
                if not 1 <= page_number <= page_count:
                    # A page past the end would raise IndexError and abort the
                    # whole run; a negative one would index from the end of
                    # the document. Skip the row and keep going instead.
                    print(
                        f"Skipping question {question['id']}: page {page_number} "
                        f"is outside {pdf_name} ({page_count} pages)."
                    )
                    continue
                page = doc[page_number - 1]
                ratio = infer_y_ratio(page, question["question_number"])
                updates.append((question["id"], round(ratio, 4)))
        finally:
            doc.close()
    return updates


def _apply_update(sb, question_id: str, ratio: float) -> None:
    """Write one ``page_y_ratio`` value, retrying on ``httpx.HTTPError``.

    Up to five attempts are made, sleeping ``0.4 * attempt`` seconds after
    each failure; the error from the fifth failed attempt is re-raised.
    """
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            sb.table("paper_questions").update({"page_y_ratio": ratio}).eq("id", question_id).execute()
            return
        except httpx.HTTPError:
            if attempt == max_attempts:
                raise
            time.sleep(0.4 * attempt)


def main() -> None:
    """Backfill ``page_y_ratio`` for COMP2211 questions.

    All ratios are computed first, then written with three worker threads.
    Any exception raised by a write is re-raised here.
    """
    sb = get_supabase()
    updates = _collect_updates(sb)
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [
            executor.submit(_apply_update, sb, question_id, ratio)
            for question_id, ratio in updates
        ]
        for future in as_completed(futures):
            # result() re-raises whatever the worker raised.
            future.result()
    print(f"Backfilled page_y_ratio for {len(updates)} COMP2211 questions.")


if __name__ == "__main__":
    main()