Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
240
backend/import_course_manifest.py
Normal file
240
backend/import_course_manifest.py
Normal file
@@ -0,0 +1,240 @@
|
||||
"""Import a canonical course manifest into Supabase-backed papers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.services.paper_processor import process_paper
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the manifest importer.

    Returns:
        The namespace produced by ``argparse`` with the attributes
        ``manifest`` and ``papers_root`` (both ``Path``, both required),
        ``user_id`` and ``course_code`` (optional strings), ``exam_keys``
        (list of strings, empty when the flag is never given), and the
        boolean switches ``process`` and ``dry_run``.
    """
    # Each option is described as (flag, keyword arguments) and registered in
    # this order, which is also the order shown in ``--help``.
    option_specs: list[tuple[str, dict[str, Any]]] = [
        (
            "--manifest",
            {
                "type": Path,
                "required": True,
                "help": "Path to the manifest JSON file.",
            },
        ),
        (
            "--papers-root",
            {
                "type": Path,
                "required": True,
                "help": "Root folder that contains the course PDF files referenced by the manifest.",
            },
        ),
        (
            "--user-id",
            {
                "required": False,
                "help": "Existing auth.users UUID used as the owner of imported course-library rows.",
            },
        ),
        (
            "--course-code",
            {
                "help": "Optional filter to only import entries from one course.",
            },
        ),
        (
            "--exam-key",
            {
                # Repeatable flag: every occurrence is appended to ``exam_keys``.
                "action": "append",
                "dest": "exam_keys",
                "default": [],
                "help": "Optional exam_key filter. Repeat the flag to import multiple entries.",
            },
        ),
        (
            "--process",
            {
                "action": "store_true",
                "help": "Run the full paper processing pipeline after the files are uploaded.",
            },
        ),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "Print what would be imported without uploading or writing database rows.",
            },
        ),
    ]

    cli = argparse.ArgumentParser(
        description="Import a canonical course paper manifest into Supabase."
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_manifest(path: Path) -> list[dict[str, Any]]:
    """Read the manifest file and return its top-level JSON array.

    Args:
        path: Location of the UTF-8 encoded manifest JSON file.

    Returns:
        The decoded JSON array. Individual elements are not validated here.

    Raises:
        ValueError: If the top-level JSON value is not an array.
    """
    parsed = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(parsed, list):
        return parsed
    raise ValueError("Manifest must be a JSON array.")
|
||||
|
||||
|
||||
def should_import(entry: dict[str, Any], args: argparse.Namespace) -> bool:
    """Decide whether a manifest entry passes the CLI filters.

    Args:
        entry: One manifest element; the keys ``course_code``, ``exam_key``
            and ``importable`` are consulted.
        args: Parsed CLI options; ``course_code`` and ``exam_keys`` act as
            filters only when they are non-empty.

    Returns:
        ``True`` only when the entry satisfies every active filter and its
        ``importable`` value is truthy.
    """
    # An unset course filter accepts every course.
    matches_course = (
        not args.course_code or entry.get("course_code") == args.course_code
    )
    if not matches_course:
        return False

    # An empty exam-key list accepts every exam.
    matches_exam = (
        not args.exam_keys or entry.get("exam_key") in set(args.exam_keys)
    )
    return matches_exam and bool(entry.get("importable"))
|
||||
|
||||
|
||||
def resolve_file_path(root: Path, filename: str | None) -> Path | None:
    """Locate the file a manifest entry refers to inside ``root``.

    Lookup proceeds in three stages and stops at the first success:

    1. ``root / filename`` exists as a regular file.
    2. Exactly one file directly inside ``root`` has the same name once every
       `` (1)`` marker is removed from both names.
    3. Exactly one file directly inside ``root`` has the same suffix and the
       same stem once every `` (1)`` marker is removed from both stems.

    Args:
        root: Directory that holds the course PDF files.
        filename: File name taken from the manifest; ``None`` or an empty
            string means the entry has no such file.

    Returns:
        The resolved path, or ``None`` when ``filename`` is empty, ``root`` is
        not a directory, nothing matches, or the fuzzy match is ambiguous.
    """
    if not filename:
        return None

    direct = root / filename
    # Use is_file() rather than exists(): a directory that happens to carry
    # the requested name is not a readable PDF and must not short-circuit the
    # fuzzy lookup below.
    if direct.is_file():
        return direct

    # A missing (or non-directory) root cannot contain any candidate; report
    # "not found" instead of letting iterdir() raise.
    if not root.is_dir():
        return None

    def normalize(name: str) -> str:
        # Removes every occurrence of the " (1)" marker, wherever it appears.
        return name.replace(" (1)", "")

    all_files = [candidate for candidate in root.iterdir() if candidate.is_file()]

    # Stage 2: compare complete file names with the marker stripped.
    target_name = normalize(filename)
    name_matches = [
        candidate for candidate in all_files
        if normalize(candidate.name) == target_name
    ]
    if len(name_matches) == 1:
        return name_matches[0]

    # Stage 3: compare stem and suffix separately.
    requested = Path(filename)
    normalized_stem = normalize(requested.stem)
    suffix = requested.suffix
    stem_matches = [
        candidate
        for candidate in all_files
        if candidate.suffix == suffix and normalize(candidate.stem) == normalized_stem
    ]
    if len(stem_matches) == 1:
        return stem_matches[0]

    # Either nothing matched or several files did; refuse to guess.
    return None
|
||||
|
||||
|
||||
def read_file_bytes(root: Path, filename: str | None) -> bytes | None:
    """Return the raw contents of a manifest-referenced file.

    Args:
        root: Directory searched via ``resolve_file_path``.
        filename: File name from the manifest; may be ``None`` or empty.

    Returns:
        The file's bytes, or ``None`` when no file name was supplied.

    Raises:
        FileNotFoundError: If a file name was supplied but it could not be
            resolved to an existing path under ``root``.
    """
    if filename:
        located = resolve_file_path(root, filename)
        if located is not None and located.exists():
            return located.read_bytes()
        raise FileNotFoundError(f"Referenced file does not exist under {root}: {filename}")
    # No file name in the manifest means there is nothing to read.
    return None
|
||||
|
||||
|
||||
def build_storage_path(entry: dict[str, Any], kind: str) -> str:
    """Build the storage object path for one PDF of a manifest entry.

    Args:
        entry: Manifest element; must contain ``exam_key`` and ``course_code``.
        kind: File role used as the object's base name (the caller in this
            file passes ``"paper"`` or ``"answer"``).

    Returns:
        ``course-library/<course_code>/<exam_key>/<kind>.pdf``.

    Raises:
        KeyError: If ``exam_key`` or ``course_code`` is missing from ``entry``.
    """
    exam_key = entry["exam_key"]
    folder = f"course-library/{entry['course_code']}/{exam_key}"
    return f"{folder}/{kind}.pdf"
|
||||
|
||||
|
||||
def upsert_paper_record(
    entry: dict[str, Any],
    user_id: str | None,
    paper_url: str,
    answer_url: str | None,
) -> str:
    """Create or refresh the ``papers`` row that mirrors a manifest entry.

    A row is considered the same paper when its ``source_kind`` is
    ``"course_library"`` and its ``source_exam_key`` equals the entry's
    ``exam_key``. A matching row is updated in place; otherwise a new row is
    inserted. In both cases the row's ``status`` is written as
    ``"processing"``.

    Args:
        entry: Manifest element supplying the paper metadata.
        user_id: Value stored in the row's ``user_id`` column; may be ``None``.
        paper_url: URL of the uploaded question PDF.
        answer_url: URL of the uploaded answer PDF, or ``None``.

    Returns:
        The ``id`` of the updated or newly inserted row.

    Raises:
        KeyError: If a required manifest key (``course_code``, ``year``,
            ``term``, ``exam_type``, ``exam_key``) is missing.
    """
    client = get_supabase()
    row = {
        "user_id": user_id,
        "course_code": entry["course_code"],
        "year": entry["year"],
        "term": entry["term"],
        "exam_type": entry["exam_type"],
        "part_label": entry.get("part_label"),
        "paper_file_url": paper_url,
        "answer_file_url": answer_url,
        "status": "processing",
        "source_kind": "course_library",
        "source_exam_key": entry["exam_key"],
        "source_question_filename": entry.get("question_pdf"),
        "source_answer_filename": entry.get("primary_answer_pdf"),
    }

    # Look for a previously imported row for the same exam.
    lookup = (
        client.table("papers")
        .select("id")
        .eq("source_kind", "course_library")
        .eq("source_exam_key", entry["exam_key"])
        .limit(1)
        .execute()
    )
    matches = lookup.data

    if not matches:
        inserted = client.table("papers").insert(row).execute()
        return inserted.data[0]["id"]

    paper_id = matches[0]["id"]
    client.table("papers").update(row).eq("id", paper_id).execute()
    return paper_id
|
||||
|
||||
|
||||
def reset_existing_processed_data(paper_id: str) -> None:
    """Discard the stored processing results of a paper.

    Deletes every ``paper_questions`` row that references the paper, then sets
    the paper's ``status`` to ``"processing"`` and nulls its derived columns.

    Args:
        paper_id: ``id`` of the ``papers`` row to reset.
    """
    client = get_supabase()

    # Remove the per-question rows first.
    client.table("paper_questions").delete().eq("paper_id", paper_id).execute()

    # Columns that are cleared back to NULL on the paper row.
    cleared_columns = (
        "error_message",
        "paper_extracted_text",
        "answer_extracted_text",
        "total_score",
        "question_count",
        "topics_summary",
        "difficulty_level",
    )
    blank_state = {"status": "processing", **dict.fromkeys(cleared_columns, None)}
    client.table("papers").update(blank_state).eq("id", paper_id).execute()
|
||||
|
||||
|
||||
async def import_entry(
    entry: dict[str, Any],
    args: argparse.Namespace,
) -> None:
    """Import a single manifest entry.

    Steps, in order: read the question PDF and the optional answer PDF from
    ``args.papers_root``; in dry-run mode print a summary and stop; otherwise
    upload the PDFs to the ``papers`` storage bucket, upsert the ``papers``
    row, and, when ``args.process`` is set, reset earlier results and await
    ``process_paper``.

    Args:
        entry: Manifest element to import.
        args: Parsed CLI options (``papers_root``, ``dry_run``, ``user_id``
            and ``process`` are read).

    Raises:
        ValueError: If the entry has no question PDF file name.
        FileNotFoundError: Propagated from ``read_file_bytes`` when a
            referenced file cannot be found.
    """
    # Both files are read before the dry-run check, so a dry run still
    # verifies that the referenced files exist.
    paper_bytes = read_file_bytes(args.papers_root, entry.get("question_pdf"))
    answer_bytes = read_file_bytes(args.papers_root, entry.get("primary_answer_pdf"))

    if paper_bytes is None:
        raise ValueError(f"Importable entry is missing question PDF: {entry['exam_key']}")

    if args.dry_run:
        print(
            f"[dry-run] {entry['exam_key']}: "
            f"question={entry.get('question_pdf')} answer={entry.get('primary_answer_pdf')}"
        )
        return

    client = get_supabase()

    def publish(kind: str, blob: bytes) -> str:
        # Upload one PDF (overwriting any existing object) and return its URL.
        destination = build_storage_path(entry, kind)
        client.storage.from_("papers").upload(
            destination,
            blob,
            file_options={"content-type": "application/pdf", "upsert": "true"},
        )
        return client.storage.from_("papers").get_public_url(destination)

    paper_url = publish("paper", paper_bytes)
    # A missing or empty answer file is skipped.
    answer_url = publish("answer", answer_bytes) if answer_bytes else None

    paper_id = upsert_paper_record(entry, args.user_id, paper_url, answer_url)
    print(f"Imported metadata for {entry['exam_key']} -> paper_id={paper_id}")

    if not args.process:
        return

    # Clear earlier results before running the pipeline again.
    reset_existing_processed_data(paper_id)
    await process_paper(paper_id, paper_bytes, answer_bytes)
    print(f"Processed {entry['exam_key']}")
||||
|
||||
|
||||
async def main() -> None:
    """Run the importer: parse options, filter the manifest, import each entry.

    Entries are imported one after another in manifest order. An exception
    raised while importing an entry propagates and stops the run.
    """
    args = parse_args()
    selected = [
        candidate
        for candidate in load_manifest(args.manifest)
        if should_import(candidate, args)
    ]

    if selected:
        print(f"Preparing to import {len(selected)} manifest entries.")
        for candidate in selected:
            await import_entry(candidate, args)
    else:
        print("No manifest entries matched the provided filters.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async importer to completion.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user