# Source listing metadata: 161 lines, 5.1 KiB, Python
"""Backfill page_y_ratio for COMP2211 subquestions."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
import fitz
|
|
import httpx
|
|
|
|
from app.services.supabase_client import get_supabase
|
|
|
|
|
|
# ROOT is the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding the COMP2211 PDFs referenced by PDF_BY_EXAM_KEY.
PAPERS_DIR = ROOT.joinpath("pastpaper-scraper", "papers", "COMP2211")

# Maps a paper's ``source_exam_key`` to the PDF file name inside PAPERS_DIR.
# Papers whose key is absent from this table are skipped by ``main``.
PDF_BY_EXAM_KEY = {
    # 2022
    "COMP2211-2022-fall-midterm": "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
    "COMP2211-2022-spring-midterm": "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
    "COMP2211-2022-spring-final-part-a": "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
    "COMP2211-2022-spring-final-part-b": "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
    # 2023
    "COMP2211-2023-spring-midterm": "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
    # 2024
    "COMP2211-2024-spring-midterm": "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
    "COMP2211-2024-spring-final": "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
}
|
|
|
|
|
|
def marker_candidates(question_number: str) -> list[str]:
    """Return the text markers to look for on a page, most specific first.

    Recognised ``question_number`` shapes:

    * ``"<left>_<right>"`` (split on the first underscore): yields ``"(<right>)"``
      and, when ``<left>`` is digits followed by lowercase letters, also the
      parenthesised letters, e.g. ``"1a_i"`` -> ``["(i)", "(a)"]``.
    * digits plus one lowercase letter, e.g. ``"1a"`` -> ``["(a)", "Problem 1"]``.
    * digits only, e.g. ``"3"`` -> ``["Problem 3"]``.
    * anything else is returned unchanged as the single candidate.

    Args:
        question_number: Question identifier as stored with the question.

    Returns:
        Candidate marker strings in the order they should be tried.
    """
    if "_" in question_number:
        left, right = question_number.split("_", 1)
        # The sub-part marker is tried first, then the enclosing part's letters.
        tokens = [f"({right})"]
        # Capture the letters directly instead of stripping the digits with a
        # substitution: the previous pattern never matched the leading digits
        # and used a backslash inside an f-string expression.
        part = re.fullmatch(r"\d+([a-z]+)", left)
        if part:
            tokens.append(f"({part.group(1)})")
        return tokens

    lettered = re.fullmatch(r"(\d+)([a-z])", question_number)
    if lettered:
        return [f"({lettered.group(2)})", f"Problem {lettered.group(1)}"]

    if question_number.isdigit():
        return [f"Problem {question_number}"]

    return [question_number]
|
|
|
|
|
|
def line_matches(line_text: str, marker: str) -> bool:
    """Tell whether a line of page text carries ``marker``.

    Whitespace in ``line_text`` is trimmed and collapsed to single spaces first.
    A marker beginning with ``"("`` must appear at the very start of the line
    (case-sensitive); any other marker matches anywhere, ignoring case.

    Args:
        line_text: Raw text of one line.
        marker: Marker string to look for.

    Returns:
        ``True`` when the normalised line matches; always ``False`` for a line
        that is empty after normalisation.
    """
    normalized = re.sub(r"\s+", " ", line_text.strip())
    if normalized == "":
        return False

    anchored = marker.startswith("(")
    if anchored:
        # Parenthesised labels only count when they open the line.
        return normalized.startswith(marker)
    return marker.lower() in normalized.lower()
|
|
|
|
|
|
def line_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` among the page's text lines and return its vertical ratio.

    The page is read via ``page.get_text("dict")``. Only blocks whose ``"type"``
    is ``0`` are scanned; each line's span texts are concatenated and checked
    with ``line_matches``.

    Args:
        page: Page to scan.
        marker: Marker string to look for.

    Returns:
        The second ``bbox`` coordinate of the smallest-valued matching line,
        taken relative to ``page.rect.y0``, divided by ``page.rect.height`` and
        clamped to ``[0.0, 0.98]``; ``None`` when no line matches.
    """
    tops: list[float] = []
    for text_block in page.get_text("dict").get("blocks", []):
        if text_block.get("type") == 0:
            for text_line in text_block.get("lines", []):
                joined = "".join(
                    piece.get("text", "") for piece in text_line.get("spans", [])
                )
                box = text_line.get("bbox")
                # Lines without a bounding box cannot be positioned, so skip them.
                if box and line_matches(joined, marker):
                    tops.append(float(box[1]))

    if tops:
        offset = min(tops) - page.rect.y0
        return max(0.0, min(offset / page.rect.height, 0.98))
    return None
|
|
|
|
|
|
def search_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Find ``marker`` with ``page.search_for`` and return the smallest vertical ratio.

    Args:
        page: Page to search.
        marker: Text to search for.

    Returns:
        For each hit rectangle, ``(rect.y0 - page.rect.y0) / page.rect.height``
        clamped to ``[0.0, 0.98]``; the minimum of those values, or ``None``
        when the search yields no rectangles.
    """
    clamped = (
        max(0.0, min((hit.y0 - page.rect.y0) / page.rect.height, 0.98))
        for hit in page.search_for(marker)
    )
    return min(clamped, default=None)
|
|
|
|
|
|
def infer_y_ratio(page: fitz.Page, question_number: str) -> float:
    """Estimate the vertical position of a question on ``page``.

    Each candidate from ``marker_candidates`` is tried in order, first with
    ``line_y_ratio`` and then with ``search_y_ratio``; the first ratio found
    is returned.

    Args:
        page: Page the question is expected on.
        question_number: Question identifier used to derive the markers.

    Returns:
        The first ratio found, or ``0.05`` when no candidate is located.
    """
    locators = (line_y_ratio, search_y_ratio)
    for marker in marker_candidates(question_number):
        for locate in locators:
            found = locate(page, marker)
            if found is not None:
                return found
    # Nothing matched: fall back to the fixed default ratio.
    return 0.05
|
|
|
|
|
|
def main() -> None:
    """Compute and store ``page_y_ratio`` for COMP2211 course-library questions.

    Steps:

    1. Fetch the ``papers`` rows with ``course_code`` ``COMP2211`` and
       ``source_kind`` ``course_library``.
    2. For each paper whose ``source_exam_key`` has an entry in
       ``PDF_BY_EXAM_KEY``, open that PDF from ``PAPERS_DIR`` and infer a ratio
       for each of the paper's ``paper_questions`` rows (rounded to 4 places).
       Papers without a mapping are skipped. A missing ``page_number`` is
       treated as page 1; a question whose page number lies outside the PDF is
       skipped with a printed warning instead of aborting the whole run.
    3. Write every collected ratio back to ``paper_questions`` through a
       three-worker thread pool, retrying each update on ``httpx.HTTPError``.

    Raises:
        httpx.HTTPError: When a single update still fails on its fifth attempt.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id, source_exam_key")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
        or []
    )

    updates: list[tuple[str, float]] = []
    for paper in papers:
        pdf_name = PDF_BY_EXAM_KEY.get(paper["source_exam_key"])
        if not pdf_name:
            continue
        # The document is a context manager, so it is closed even on errors.
        with fitz.open(PAPERS_DIR / pdf_name) as doc:
            questions = (
                sb.table("paper_questions")
                .select("id, question_number, page_number")
                .eq("paper_id", paper["id"])
                .order("display_order")
                .execute()
                .data
                or []
            )
            for question in questions:
                page_number = question.get("page_number") or 1
                # Guard the index: a too-large value would raise IndexError and
                # discard all pending updates; a negative one would silently
                # address a page counted from the end of the document.
                if not 1 <= page_number <= doc.page_count:
                    print(
                        f"Skipping question {question['id']}: page {page_number} "
                        f"is outside {pdf_name} ({doc.page_count} pages)."
                    )
                    continue
                ratio = infer_y_ratio(doc[page_number - 1], question["question_number"])
                updates.append((question["id"], round(ratio, 4)))

    max_attempts = 5

    def apply_update(payload: tuple[str, float]) -> None:
        """Write one ``(question_id, ratio)`` pair, retrying on HTTP errors."""
        question_id, ratio = payload
        for attempt in range(1, max_attempts + 1):
            try:
                sb.table("paper_questions").update({"page_y_ratio": ratio}).eq("id", question_id).execute()
                return
            except httpx.HTTPError:
                if attempt >= max_attempts:
                    raise
                # Linear back-off: 0.4s, 0.8s, 1.2s, 1.6s.
                time.sleep(0.4 * attempt)

    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(apply_update, payload) for payload in updates]
        for future in as_completed(futures):
            # Re-raise any exception from the worker thread.
            future.result()

    print(f"Backfilled page_y_ratio for {len(updates)} COMP2211 questions.")
|
|
|
|
|
|
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|