Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
160
backend/backfill_comp2211_page_y.py
Normal file
160
backend/backfill_comp2211_page_y.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Backfill page_y_ratio for COMP2211 subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
import fitz
|
||||
import httpx
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent

# Folder that the PDF file names in PDF_BY_EXAM_KEY are resolved against.
PAPERS_DIR = ROOT.joinpath("pastpaper-scraper", "papers", "COMP2211")

# Lookup from a paper's ``source_exam_key`` to the PDF file name inside
# PAPERS_DIR. Papers whose key is absent here are skipped by main().
PDF_BY_EXAM_KEY = {
    # 2022
    "COMP2211-2022-fall-midterm": "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
    "COMP2211-2022-spring-midterm": "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
    "COMP2211-2022-spring-final-part-a": "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
    "COMP2211-2022-spring-final-part-b": "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
    # 2023
    "COMP2211-2023-spring-midterm": "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
    # 2024
    "COMP2211-2024-spring-midterm": "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
    "COMP2211-2024-spring-final": "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
}
|
||||
|
||||
|
||||
def marker_candidates(question_number: str) -> list[str]:
    """Return the text markers to look for on a page, in search order.

    Args:
        question_number: Stored question identifier. The shapes handled are
            ``"<digits><letters>_<suffix>"`` (e.g. ``"3a_ii"``),
            ``"<digits><letter>"`` (e.g. ``"3a"``), all digits (e.g. ``"3"``),
            and anything else, which is returned unchanged.

    Returns:
        Candidate markers, most specific first:

        * ``"3a_ii"`` -> ``["(ii)", "(a)"]``
        * ``"3ab_i"`` -> ``["(i)", "(ab)"]``
        * ``"3_i"``   -> ``["(i)"]``
        * ``"3a"``    -> ``["(a)", "Problem 3"]``
        * ``"3"``     -> ``["Problem 3"]``
        * other       -> ``[question_number]``
    """
    if "_" in question_number:
        left, right = question_number.split("_", 1)
        tokens: list[str] = []
        # Capture the letter suffix directly instead of stripping the digits
        # with a regex written inside an f-string: a backslash in an f-string
        # expression is a SyntaxError before Python 3.12, and the doubled
        # backslash in a raw string matches a literal "\" rather than digits.
        m = re.fullmatch(r"\d+([a-z]+)", left)
        if m:
            tokens.append(f"({m.group(1)})")
        tokens.append(f"({right})")
        # Reverse so the innermost label (the part after "_") is tried first.
        return tokens[::-1]

    m = re.fullmatch(r"(\d+)([a-z])", question_number)
    if m:
        return [f"({m.group(2)})", f"Problem {m.group(1)}"]

    if question_number.isdigit():
        return [f"Problem {question_number}"]

    return [question_number]
|
||||
|
||||
|
||||
def line_matches(line_text: str, marker: str) -> bool:
    """Tell whether a line of page text counts as a hit for ``marker``.

    The line is trimmed and every run of whitespace is collapsed to a single
    space before comparing.

    Args:
        line_text: Raw text of one line.
        marker: Marker to look for. A marker beginning with ``"("`` must be
            the prefix of the line (case-sensitive); any other marker may
            appear anywhere in the line and is compared case-insensitively.

    Returns:
        ``True`` on a match, ``False`` otherwise (including for a blank line).
    """
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so joining with " " trims and collapses in one step.
    normalized = " ".join(line_text.split())
    if normalized == "":
        return False

    is_label = marker.startswith("(")
    if not is_label:
        return marker.lower() in normalized.lower()
    return normalized.startswith(marker)
|
||||
|
||||
|
||||
def line_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` line by line and return its vertical position ratio.

    Walks the ``"dict"`` text extraction of ``page``, rebuilds each line from
    its spans, and tests it with ``line_matches``. Only blocks whose ``type``
    is 0 are inspected (text blocks in PyMuPDF's dict output).

    Args:
        page: Page to scan.
        marker: Marker text, as produced by ``marker_candidates``.

    Returns:
        The top edge of the highest matching line, expressed as a fraction of
        the page height and clamped to ``[0.0, 0.98]``; ``None`` when no line
        matches.
    """
    tops: list[float] = []
    for block in page.get_text("dict").get("blocks", []):
        if block.get("type") == 0:
            for row in block.get("lines", []):
                joined = "".join(
                    piece.get("text", "") for piece in row.get("spans", [])
                )
                if not line_matches(joined, marker):
                    continue
                box = row.get("bbox")
                if box:
                    # bbox[1] is the line's top y coordinate.
                    tops.append(float(box[1]))

    if tops:
        highest = min(tops)
        fraction = (highest - page.rect.y0) / page.rect.height
        # Upper bound of 0.98 is presumably to keep the anchor off the very
        # bottom edge of the page -- confirm with the consumer of page_y_ratio.
        return max(0.0, min(fraction, 0.98))
    return None
|
||||
|
||||
|
||||
def search_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` with the page's own text search and return its ratio.

    Args:
        page: Page to search; ``page.search_for(marker)`` supplies the hit
            rectangles.
        marker: Text to search for.

    Returns:
        The smallest (top-most) hit position as a fraction of the page height,
        each hit clamped to ``[0.0, 0.98]``; ``None`` when there are no hits.
    """
    clamped = (
        max(0.0, min((hit.y0 - page.rect.y0) / page.rect.height, 0.98))
        for hit in page.search_for(marker)
    )
    # default=None covers the "no hits" case without building a list first.
    return min(clamped, default=None)
|
||||
|
||||
|
||||
def infer_y_ratio(page: fitz.Page, question_number: str) -> float:
    """Estimate where a question starts on ``page`` as a height fraction.

    Each marker from ``marker_candidates(question_number)`` is tried in order:
    first with the line-based matcher (``line_y_ratio``), then with the page
    text search (``search_y_ratio``). The first non-``None`` result wins.

    Args:
        page: Page the question is expected on.
        question_number: Stored question identifier.

    Returns:
        The located ratio, or the fallback ``0.05`` when no marker is found.
    """
    locators = (line_y_ratio, search_y_ratio)
    for marker in marker_candidates(question_number):
        for locate in locators:
            found = locate(page, marker)
            if found is not None:
                return found
    # Nothing matched on this page: fall back to a fixed position near the top.
    return 0.05
|
||||
|
||||
|
||||
def main() -> None:
    """Recompute ``page_y_ratio`` for COMP2211 questions and write it back.

    Steps:
        1. Fetch ``papers`` rows with ``course_code == "COMP2211"`` and
           ``source_kind == "course_library"``.
        2. For each paper whose ``source_exam_key`` is in ``PDF_BY_EXAM_KEY``,
           open the PDF, load its ``paper_questions`` rows, and infer a ratio
           for every question from the page named by ``page_number``
           (page 1 when that field is missing or falsy).
        3. Write every ratio, rounded to 4 decimals, to
           ``paper_questions.page_y_ratio`` using three worker threads.

    Raises:
        httpx.HTTPError: When a single update still fails on its fifth attempt.
    """
    client = get_supabase()

    paper_response = (
        client.table("papers")
        .select("id, source_exam_key")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
    )
    paper_rows = paper_response.data or []

    pending: list[tuple[str, float]] = []
    for paper_row in paper_rows:
        file_name = PDF_BY_EXAM_KEY.get(paper_row["source_exam_key"])
        if not file_name:
            # No PDF is mapped to this exam key; nothing to measure.
            continue

        document = fitz.open(PAPERS_DIR / file_name)
        try:
            question_response = (
                client.table("paper_questions")
                .select("id, question_number, page_number")
                .eq("paper_id", paper_row["id"])
                .order("display_order")
                .execute()
            )
            for row in question_response.data or []:
                # page_number is 1-based; document indexing is 0-based.
                one_based = row.get("page_number") or 1
                target_page = document[one_based - 1]
                y_ratio = infer_y_ratio(target_page, row["question_number"])
                pending.append((row["id"], round(y_ratio, 4)))
        finally:
            document.close()

    def _push(item: tuple[str, float]) -> None:
        """Write one ratio, retrying on ``httpx.HTTPError`` up to 5 attempts."""
        row_id, value = item
        for attempt in range(1, 6):
            try:
                client.table("paper_questions").update({"page_y_ratio": value}).eq("id", row_id).execute()
            except httpx.HTTPError:
                if attempt >= 5:
                    raise
                # Linear back-off: 0.4s, 0.8s, 1.2s, 1.6s.
                time.sleep(0.4 * attempt)
            else:
                return

    with ThreadPoolExecutor(max_workers=3) as pool:
        submitted = [pool.submit(_push, item) for item in pending]
        for finished in as_completed(submitted):
            # result() re-raises any exception from the worker thread.
            finished.result()

    print(f"Backfilled page_y_ratio for {len(pending)} COMP2211 questions.")
|
||||
|
||||
|
||||
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user