"""Backfill page_y_ratio for COMP2211 subquestions.""" from __future__ import annotations import re import time from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed import fitz import httpx from app.services.supabase_client import get_supabase ROOT = Path(__file__).resolve().parent.parent PAPERS_DIR = ROOT / "pastpaper-scraper" / "papers" / "COMP2211" PDF_BY_EXAM_KEY = { "COMP2211-2022-fall-midterm": "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf", "COMP2211-2022-spring-midterm": "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf", "COMP2211-2022-spring-final-part-a": "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf", "COMP2211-2022-spring-final-part-b": "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf", "COMP2211-2023-spring-midterm": "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf", "COMP2211-2024-spring-midterm": "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf", "COMP2211-2024-spring-final": "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf", } def marker_candidates(question_number: str) -> list[str]: if "_" in question_number: left, right = question_number.split("_", 1) tokens: list[str] = [] m = re.fullmatch(r"(\d+)([a-z])", left) if m: tokens.append(f"({m.group(2)})") elif re.fullmatch(r"\d+[a-z]+", left): tokens.append(f"({re.sub(r'^\\d+', '', left)})") tokens.append(f"({right})") return tokens[::-1] m = re.fullmatch(r"(\d+)([a-z])", question_number) if m: return [f"({m.group(2)})", f"Problem {m.group(1)}"] if question_number.isdigit(): return [f"Problem {question_number}"] return [question_number] def line_matches(line_text: str, marker: str) -> bool: text = re.sub(r"\s+", " ", line_text.strip()) if not text: return False if marker.startswith("("): return text.startswith(marker) return marker.lower() in text.lower() def line_y_ratio(page: fitz.Page, marker: str) -> float | None: data = page.get_text("dict") hits: list[float] = [] for block in data.get("blocks", []): if block.get("type") != 0: continue for line in block.get("lines", []): line_text = "".join( span.get("text", "") for span in line.get("spans", []) ) if line_matches(line_text, marker): bbox = line.get("bbox") if bbox: hits.append(float(bbox[1])) if not hits: return None y = min(hits) return max(0.0, min((y - page.rect.y0) / page.rect.height, 0.98)) def search_y_ratio(page: fitz.Page, marker: str) -> float | None: ratios: list[float] = [] for rect in page.search_for(marker): ratios.append(max(0.0, min((rect.y0 - page.rect.y0) / page.rect.height, 0.98))) return min(ratios) if ratios else None def infer_y_ratio(page: fitz.Page, question_number: str) -> float: for marker in marker_candidates(question_number): ratio = line_y_ratio(page, marker) if ratio is not None: return ratio ratio = search_y_ratio(page, marker) if ratio is not None: return ratio return 0.05 def main() -> None: sb = get_supabase() papers = ( sb.table("papers") .select("id, source_exam_key") .eq("course_code", "COMP2211") .eq("source_kind", "course_library") .execute() .data or [] ) updates: list[tuple[str, float]] = [] for paper in papers: exam_key = paper["source_exam_key"] pdf_name = PDF_BY_EXAM_KEY.get(exam_key) if not pdf_name: continue pdf_path = PAPERS_DIR / pdf_name doc = fitz.open(pdf_path) try: questions = ( sb.table("paper_questions") .select("id, question_number, page_number") .eq("paper_id", paper["id"]) .order("display_order") .execute() .data or [] ) for question in questions: page_number = question.get("page_number") or 1 page = doc[page_number - 1] ratio = infer_y_ratio(page, question["question_number"]) updates.append((question["id"], round(ratio, 4))) finally: doc.close() def apply_update(payload: tuple[str, float]) -> None: question_id, ratio = payload attempts = 0 while True: try: sb.table("paper_questions").update({"page_y_ratio": ratio}).eq("id", question_id).execute() return except httpx.HTTPError: attempts += 1 if attempts >= 5: raise time.sleep(0.4 * attempts) with ThreadPoolExecutor(max_workers=3) as executor: futures = [executor.submit(apply_update, payload) for payload in updates] for future in as_completed(futures): future.result() print(f"Backfilled page_y_ratio for {len(updates)} COMP2211 questions.") if __name__ == "__main__": main()