Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Zhao
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions

16
backend/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
# Backend image: installs the Python project and serves the FastAPI app with uvicorn on port 8000.
FROM python:3.12-slim
WORKDIR /app
# System deps for PyMuPDF (MuPDF headers plus a C/C++ toolchain, presumably for source builds)
RUN apt-get update && apt-get install -y --no-install-recommends \
libmupdf-dev gcc g++ && \
rm -rf /var/lib/apt/lists/*
# Project metadata is copied before the application code so the dependency layer can be cached.
# NOTE(review): `pip install .` runs before app/ is copied — confirm the build backend
# tolerates the package directory being absent at install time.
COPY pyproject.toml .
RUN pip install --no-cache-dir .
COPY app/ app/
EXPOSE 8000
# Bind to all interfaces so the server is reachable from outside the container.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,4 @@
-- Add processing-progress columns to papers (current step label, progress counter, total).
-- IF NOT EXISTS keeps the migration safe to re-run.
ALTER TABLE papers
ADD COLUMN IF NOT EXISTS processing_step text DEFAULT NULL,
ADD COLUMN IF NOT EXISTS processing_progress integer DEFAULT 0,
ADD COLUMN IF NOT EXISTS processing_total integer DEFAULT 0;

0
backend/app/__init__.py Normal file
View File

36
backend/app/config.py Normal file
View File

@@ -0,0 +1,36 @@
from pydantic_settings import BaseSettings
from functools import lru_cache
import os
class Settings(BaseSettings):
    """Application settings: Supabase credentials plus LLM provider endpoints and keys.

    The three Supabase fields have no default and are therefore required; every
    LLM field defaults to a base URL or to an empty API key.
    """

    # Supabase
    supabase_url: str
    supabase_anon_key: str
    supabase_service_role_key: str
    # LLM - laozhang (gpt-4o, gpt-4o-mini)
    laozhang_base_url: str = "https://api.laozhang.ai/v1"
    laozhang_api_key: str = ""
    # LLM - DashScope (qwen-plus)
    dashscope_base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    dashscope_api_key: str = ""
    # LLM - DeepSeek
    deepseek_base_url: str = "https://api.deepseek.com/v1"
    deepseek_api_key: str = ""
    # Google Gemini (official)
    google_gemini_api_key: str = ""
    model_config = {
        # The .env path is resolved two directories above this module's directory.
        "env_file": os.path.join(os.path.dirname(__file__), "../../.env"),
        "env_file_encoding": "utf-8",
        # Keys that do not correspond to a declared field are ignored.
        "extra": "ignore",
    }
@lru_cache
def get_settings() -> Settings:
    """Return the shared Settings instance; lru_cache ensures it is built only once."""
    settings = Settings()
    return settings

View File

View File

@@ -0,0 +1,34 @@
"""Auth dependency: validate Supabase JWT and return user_id"""
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from app.services.supabase_client import get_supabase
# auto_error=False lets the dependency below produce its own 401 for a missing header.
bearer_scheme = HTTPBearer(auto_error=False)


async def get_current_user_id(
    credentials: HTTPAuthorizationCredentials | None = Depends(bearer_scheme),
) -> str:
    """Validate the Bearer token against Supabase and return the user's id.

    Raises:
        HTTPException: 401 when the Authorization header is missing, when
            Supabase rejects the token, or when the token maps to no user.
    """
    if not credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Not authenticated",
        )
    sb = get_supabase()
    # Only the remote lookup sits inside the try block, so the explicit
    # "Invalid token" error below is no longer swallowed and re-labelled.
    try:
        result = sb.auth.get_user(credentials.credentials)
    except Exception as exc:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or expired token",
        ) from exc
    user = result.user if result else None
    if not user:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid token",
        )
    return user.id

59
backend/app/main.py Normal file
View File

@@ -0,0 +1,59 @@
import asyncio
import threading
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.routers import analytics, papers, attempts, questions
def _resume_stale_papers():
    """On startup, find papers stuck in "processing" and resume their AI processing.

    Each stuck paper is handed to ``process_paper`` (with empty paper bytes and no
    answer bytes) on its own daemon thread. Any failure is printed and otherwise
    ignored so that startup is never blocked.
    """
    try:
        # Imported lazily, inside the guarded region, so import errors are also tolerated.
        from app.services.supabase_client import get_supabase
        from app.services.paper_processor import process_paper

        client = get_supabase()
        stuck_query = client.table("papers").select("id").eq("status", "processing")
        stuck_rows = stuck_query.execute().data
        for row in stuck_rows or []:
            stuck_id = row["id"]
            print(f"[STARTUP] Resuming processing for paper {stuck_id[:8]}...")

            # Bind the id as a default argument so each thread keeps its own value.
            def run(pid=stuck_id):
                asyncio.run(process_paper(pid, b"", None))

            worker = threading.Thread(target=run, daemon=True)
            worker.start()
    except Exception as e:
        print(f"[STARTUP] Resume skipped: {e}")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: resume stale papers before serving; no shutdown work."""
    # Startup
    _resume_stale_papers()
    yield
    # Shutdown (nothing to do)
# Application instance; `lifespan` runs the stale-paper resume hook at startup.
app = FastAPI(title="PastPaper Master API", version="0.1.0", lifespan=lifespan)
# NOTE(review): wildcard origins together with allow_credentials=True is a very
# permissive CORS policy — restrict allow_origins to the real frontend before launch.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Left open during development; tighten before going live
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Each feature router is mounted under its own /api/<name> prefix.
app.include_router(papers.router, prefix="/api/papers", tags=["papers"])
app.include_router(attempts.router, prefix="/api/attempts", tags=["attempts"])
app.include_router(questions.router, prefix="/api/questions", tags=["questions"])
app.include_router(analytics.router, prefix="/api/analytics", tags=["analytics"])


@app.get("/health")
def health():
    """Health probe: always returns a static OK payload."""
    return {"status": "ok"}

View File

View File

@@ -0,0 +1,285 @@
"""Course-level analytics endpoints."""
from __future__ import annotations
from collections import Counter, defaultdict
from fastapi import APIRouter
from app.services.supabase_client import get_supabase
# Router for the analytics endpoints defined in this module.
router = APIRouter()
# Numeric weight of each difficulty value, used to average difficulty across questions.
DIFFICULTY_SCORE = {"easy": 1, "medium": 2, "hard": 3}
# Display label for a rounded average difficulty score.
DIFFICULTY_LABEL = {1: "Easy", 2: "Medium", 3: "Hard"}
# ── Topic normalization ──────────────────────────────────────
# Map variant spellings to canonical label
# Keys are lower-case variant spellings; values are the canonical display label.
# normalize_topic() lower-cases and strips a label before looking it up here.
_TOPIC_ALIASES: dict[str, str] = {
    "numpy": "NumPy",
    "naïve bayes": "Naive Bayes",
    "naïve bayes classifier": "Naive Bayes",
    "naive bayes classifier": "Naive Bayes",
    "bayes classifier": "Naive Bayes",
    "bayes model": "Naive Bayes",
    "bayes' theorem": "Naive Bayes",
    "bayes' rule": "Naive Bayes",
    "k-nearest neighbors": "K-Nearest Neighbors (KNN)",
    "knn": "K-Nearest Neighbors (KNN)",
    "k-means clustering": "K-Means Clustering",
    "k-means": "K-Means Clustering",
    "k means": "K-Means Clustering",
    "multilayer perceptron": "Multilayer Perceptron (MLP)",
    "multi-layer perceptron": "Multilayer Perceptron (MLP)",
    "multi-layer perceptron (mlp)": "Multilayer Perceptron (MLP)",
    "mlp": "Multilayer Perceptron (MLP)",
    "single layer perceptron": "Perceptron",
    "convolutional neural network": "CNN",
    "convolutional neural network (cnn)": "CNN",
    "convolutional neural networks": "CNN",
    "cnn architecture": "CNN",
    "cnn properties": "CNN",
    "python fundamentals": "Python",
    "python programming": "Python",
    "python implementation": "Python",
    "advanced python programming": "Python",
    "python programming: convolutional neural network": "CNN",
    "cross-validation": "Cross Validation",
    "model evaluation implementation": "Model Evaluation",
    "digital image processing": "Image Processing",
    "computer vision": "Image Processing",
    "array slicing": "Array Slicing",
    "slicing": "Array Slicing",
    "array indexing": "Array Slicing",
    "array reshaping": "Reshape",
    "array views": "Array Slicing",
    "view vs copy": "Array Slicing",
    "boolean indexing": "Array Slicing",
    "arange": "NumPy",
    "newaxis": "NumPy",
    "expand dims": "NumPy",
    "transpose": "NumPy",
    "type casting": "NumPy",
    "element-wise operation": "NumPy",
    "array reduction": "NumPy",
    "multi-dimensional array": "NumPy",
    "dot product": "NumPy",
    "vectorization": "NumPy",
    "activation functions": "Activation Function",
    "linear activation function": "Activation Function",
    "neural network architecture": "Neural Networks",
    "hidden layer": "Neural Networks",
    "deep learning": "Neural Networks",
    "deep learning frameworks": "Neural Networks",
    "alpha-beta pruning": "Alpha-Beta Pruning",
    "minimax algorithm": "Minimax",
    "ethics of ai": "AI Ethics",
    "ethics": "AI Ethics",
    "cosine distance": "Cosine Similarity",
    "distance calculation": "Distance Metrics",
    "euclidean distance": "Distance Metrics",
    "manhattan distance": "Distance Metrics",
    "hamming distance": "Distance Metrics",
    "precision": "Model Evaluation",
    "recall": "Model Evaluation",
    "f1 score": "Model Evaluation",
    "macro f1 score": "Model Evaluation",
    "accuracy": "Model Evaluation",
    "classification accuracy": "Model Evaluation",
    "confusion matrix": "Model Evaluation",
    "convolution operation": "Convolution",
    "dilated convolution": "Convolution",
    "3d convolution": "Convolution",
    "gaussian likelihood": "Probability",
    "gaussian distribution": "Probability",
    "categorical likelihood": "Probability",
    "conditional probability": "Probability",
    "total probability theorem": "Probability",
    "probability assumptions": "Probability",
    "tensorflow": "Keras",
    "model summary": "Keras",
    "model construction": "Keras",
    "trainable parameters": "Parameter Calculation",
    "parameter reduction": "Parameter Calculation",
    "output shape calculation": "Parameter Calculation",
    "shape calculation": "Parameter Calculation",
}
def normalize_topic(label: str) -> str:
    """Map a topic label to its canonical form.

    The label is stripped and lower-cased for the alias lookup. When no alias
    matches, the stripped label is returned, so labels that differ only in
    surrounding whitespace are no longer counted as separate topics.
    """
    cleaned = label.strip()
    return _TOPIC_ALIASES.get(cleaned.lower(), cleaned)
def extract_topic_labels(question: dict) -> list[str]:
    """Collect the normalized, de-duplicated topic labels of a question.

    The primary ``analytics_topic`` comes first, followed by ``topic_tags``.
    The legacy ``topics`` field is consulted only when neither produced a label.
    """
    candidates: list[str] = []

    def add_unique(tags) -> None:
        # Keep first-seen order and skip empty values and repeats.
        for tag in tags or []:
            if tag and tag not in candidates:
                candidates.append(tag)

    primary = question.get("analytics_topic")
    if primary:
        candidates.append(primary)
    add_unique(question.get("topic_tags"))
    if not candidates:
        add_unique(question.get("topics"))

    # dict.fromkeys de-duplicates the normalized labels while preserving order.
    return list(dict.fromkeys(normalize_topic(raw) for raw in candidates))
def extract_question_family(question: dict) -> str:
    """Return the question's format, else its type, else ``"unknown"``."""
    for field in ("question_format", "question_type"):
        value = question.get(field)
        if value:
            return value
    return "unknown"
@router.get("/courses")
async def list_courses():
    """Return the sorted course codes that have at least one paper in "ready" status."""
    ready_query = get_supabase().table("papers").select("course_code").eq("status", "ready")
    ready_rows = ready_query.execute().data
    unique_codes = set()
    for row in ready_rows:
        # Rows with a missing or empty course code are skipped.
        if row.get("course_code"):
            unique_codes.add(row["course_code"])
    return sorted(unique_codes)
def _paper_source_label(paper: dict) -> str:
    """Build a label like "2023 Fall Final Part A" from a paper row, tolerating NULL columns."""
    label = (
        f"{paper.get('year') or ''} {(paper.get('term') or '').title()} "
        f"{(paper.get('exam_type') or '').title()}"
    ).strip()
    if paper.get("part_label"):
        label = f"{label} Part {paper['part_label']}"
    return label


def _share_pct(count: int, total: int) -> int:
    """Return count as a rounded percentage of total (0 when total is 0)."""
    return round((count / total) * 100) if total else 0


@router.get("/course/{course_code}")
async def get_course_analytics(course_code: str):
    """Aggregate topic, question-type and difficulty statistics for one course.

    Only papers with status "ready" are considered. The response always has
    the same keys (including ``all_questions``), whether or not papers exist.
    """
    code = course_code.upper()
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label, status")
        .eq("course_code", code)
        .eq("status", "ready")
        .order("year", desc=True)
        .execute()
        .data
    )
    if not papers:
        return {
            "course_code": code,
            "kpi": {"papers": 0, "questions": 0, "topics": 0, "difficulty": "N/A"},
            "topic_frequency": [],
            "question_types": [],
            # Present in the non-empty response too, so clients see one shape.
            "all_questions": [],
            "difficulty_distribution": {"easy": 0, "medium": 0, "hard": 0},
            "high_yield_topics": [],
        }

    paper_ids = [paper["id"] for paper in papers]
    questions = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, difficulty"
        )
        .in_("paper_id", paper_ids)
        .order("display_order")
        .execute()
        .data
    )
    papers_by_id = {paper["id"]: paper for paper in papers}
    total_questions = len(questions)

    topic_counter: Counter[str] = Counter()
    type_counter: Counter[str] = Counter()
    difficulty_counter: Counter[str] = Counter()
    topic_examples: dict[str, list[dict]] = defaultdict(list)
    difficulty_scores: list[int] = []
    all_question_items: list[dict] = []

    for question in questions:
        question_type = extract_question_family(question)
        type_counter[question_type] += 1

        # Only the known difficulty values contribute to the distribution and average.
        difficulty = question.get("difficulty")
        if difficulty in DIFFICULTY_SCORE:
            difficulty_counter[difficulty] += 1
            difficulty_scores.append(DIFFICULTY_SCORE[difficulty])

        paper = papers_by_id.get(question["paper_id"], {})
        topics = extract_topic_labels(question)
        q_item = {
            "paper_id": paper.get("id"),
            "source": _paper_source_label(paper),
            "question_number": question["question_number"],
            # A NULL question_text becomes an empty preview instead of raising.
            "preview": (question.get("question_text") or "")[:220],
            "difficulty": question.get("difficulty"),
            "question_type": question_type,
            "year": paper.get("year"),
            "term": paper.get("term"),
            "exam_type": paper.get("exam_type"),
            "topics": topics,
        }
        all_question_items.append(q_item)
        for topic in topics:
            topic_counter[topic] += 1
            topic_examples[topic].append(q_item)

    avg_difficulty = "N/A"
    if difficulty_scores:
        rounded = round(sum(difficulty_scores) / len(difficulty_scores))
        avg_difficulty = DIFFICULTY_LABEL.get(rounded, "Medium")

    topic_frequency = [
        {
            "label": topic,
            "count": count,
            "pct": _share_pct(count, total_questions),
            "questions": topic_examples[topic],
        }
        for topic, count in topic_counter.most_common()
    ]
    question_types = [
        {"label": label, "count": count, "pct": _share_pct(count, total_questions)}
        for label, count in type_counter.most_common()
    ]

    return {
        "course_code": code,
        "kpi": {
            "papers": len(papers),
            "questions": total_questions,
            "topics": len(topic_counter),
            "difficulty": avg_difficulty,
        },
        "topic_frequency": topic_frequency,
        "question_types": question_types,
        "all_questions": all_question_items,
        "difficulty_distribution": {
            "easy": difficulty_counter.get("easy", 0),
            "medium": difficulty_counter.get("medium", 0),
            "hard": difficulty_counter.get("hard", 0),
        },
        "high_yield_topics": [topic for topic, _ in topic_counter.most_common(5)],
    }

View File

@@ -0,0 +1,208 @@
"""用户答题记录 + 拍照批改 + 错题本"""
import asyncio
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
from pydantic import BaseModel
from app.services.supabase_client import get_supabase
from app.services.grader import ocr_photo, grade_answer
from app.dependencies.auth import get_current_user_id
# Router for the attempt, photo-grading and error-book endpoints defined in this module.
router = APIRouter()
class AttemptCreate(BaseModel):
    """Request body for recording a single attempt at a question."""

    question_id: str
    attempt_type: str  # "select" | "input" | "photo"
    # Both fields are optional; create_attempt stores them as provided.
    user_answer: str | None = None
    is_correct: bool | None = None
class AttemptUpdate(BaseModel):
    """Request body for updating an attempt's error-book flags; None means "leave unchanged"."""

    in_error_book: bool | None = None
    mastered: bool | None = None
@router.post("/")
async def create_attempt(data: AttemptCreate, user_id: str = Depends(get_current_user_id)):
    """Record one attempt for the authenticated user and return the inserted row."""
    sb = get_supabase()
    payload = {
        "user_id": user_id,
        "question_id": data.question_id,
        "attempt_type": data.attempt_type,
        "user_answer": data.user_answer,
        "is_correct": data.is_correct,
    }
    # An explicitly wrong answer goes straight into the error book;
    # an unknown result (None) does not.
    if data.is_correct is False:
        payload.update(in_error_book=True)
    inserted = sb.table("user_attempts").insert(payload).execute()
    return inserted.data[0]
def _safe_photo_name(filename: str | None) -> str:
    """Reduce a client-supplied filename to one safe path segment, defaulting to photo.jpg."""
    # Normalise Windows separators, then keep only the final path component.
    name = (filename or "").replace("\\", "/").rsplit("/", 1)[-1].strip()
    if name in ("", ".", ".."):
        return "photo.jpg"
    return name


@router.post("/photo")
async def photo_attempt(
    question_id: str = Form(...),
    photo: UploadFile = File(...),
    user_id: str = Depends(get_current_user_id),
):
    """Photo upload -> OCR -> AI grading, saved as a "photo" attempt.

    Raises:
        HTTPException: 404 when the question does not exist, 400 when the
            uploaded photo is empty.
    """
    sb = get_supabase()

    # 1. Validate the question first so that nothing is stored for unknown ids.
    q_result = sb.table("paper_questions").select("*").eq("id", question_id).execute()
    if not q_result.data:
        raise HTTPException(status_code=404, detail="Question not found")
    question = q_result.data[0]

    # 2. Read photo
    photo_bytes = await photo.read()
    if not photo_bytes:
        raise HTTPException(status_code=400, detail="Empty photo upload")

    # 3. Upload to storage; the client filename is reduced to a safe basename
    #    so it cannot inject extra path segments.
    storage_path = f"attempts/{user_id}/{question_id}/{_safe_photo_name(photo.filename)}"
    sb.storage.from_("attempt-photos").upload(
        storage_path, photo_bytes,
        file_options={"content-type": photo.content_type or "image/jpeg", "upsert": "true"},
    )
    photo_url = sb.storage.from_("attempt-photos").get_public_url(storage_path)

    # 4. OCR (run in thread pool to avoid blocking event loop)
    ocr_text = await asyncio.to_thread(ocr_photo, photo_bytes)

    # 5. AI grading (run in thread pool)
    grade_result = await asyncio.to_thread(grade_answer, question, ocr_text)
    is_correct = grade_result.get("is_correct", False)

    # 6. Save attempt; anything not graded correct goes into the error book.
    record = {
        "user_id": user_id,
        "question_id": question_id,
        "attempt_type": "photo",
        "photo_url": photo_url,
        "photo_ocr_text": ocr_text,
        "is_correct": is_correct,
        "feedback": grade_result.get("feedback", ""),
        "error_at_step": grade_result.get("error_at_step"),
        "in_error_book": not is_correct,
    }
    result = sb.table("user_attempts").insert(record).execute()
    return {
        "attempt": result.data[0],
        "ocr_text": ocr_text,
        "grade": grade_result,
    }
@router.get("/error-book")
async def get_error_book(
    course_code: str | None = None,
    user_id: str = Depends(get_current_user_id),
):
    """Return the user's unmastered error-book attempts, enriched with question and paper.

    When ``course_code`` is given, only attempts whose paper belongs to that
    course are returned. Attempts whose paper cannot be resolved are excluded
    from filtered results instead of appearing under every course.
    """
    sb = get_supabase()
    wanted_course = course_code.upper() if course_code else None
    attempts = (
        sb.table("user_attempts")
        .select("*")
        .eq("user_id", user_id)
        .eq("in_error_book", True)
        .eq("mastered", False)
        .order("created_at", desc=True)
        .execute()
        .data
    )
    if not attempts:
        return []

    question_ids = list({attempt["question_id"] for attempt in attempts})
    questions = (
        sb.table("paper_questions")
        .select("*")
        .in_("id", question_ids)
        .execute()
        .data
    )
    questions_by_id = {question["id"]: question for question in questions}

    paper_ids = list({question["paper_id"] for question in questions})
    # Skip the lookup when no question resolved; an empty IN list is pointless.
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label")
        .in_("id", paper_ids)
        .execute()
        .data
        if paper_ids
        else []
    )
    papers_by_id = {paper["id"]: paper for paper in papers}

    enriched = []
    for attempt in attempts:
        question = questions_by_id.get(attempt["question_id"])
        if not question:
            continue
        paper = papers_by_id.get(question["paper_id"])
        if wanted_course and (not paper or paper.get("course_code") != wanted_course):
            continue
        enriched.append(
            {
                **attempt,
                "paper_questions": {
                    **question,
                    "paper": paper,
                },
            }
        )
    return enriched
@router.get("/by-paper/{paper_id}")
async def get_paper_attempts(paper_id: str, user_id: str = Depends(get_current_user_id)):
    """Return the user's latest photo-graded attempt for each question of a paper.

    The paper's question ids are resolved first, and the attempt query is
    filtered by user, attempt type and those ids in the database, instead of
    downloading every attempt of the user and filtering in Python.
    """
    sb = get_supabase()
    question_rows = (
        sb.table("paper_questions")
        .select("id")
        .eq("paper_id", paper_id)
        .execute()
        .data
    )
    question_ids = [row["id"] for row in question_rows]
    if not question_ids:
        return []

    attempts = (
        sb.table("user_attempts")
        .select("question_id, is_correct, feedback, photo_ocr_text, attempt_type, created_at")
        .eq("user_id", user_id)
        .eq("attempt_type", "photo")
        .in_("question_id", question_ids)
        .order("created_at", desc=True)
        .execute()
        .data
    )

    # Rows arrive newest-first, so the first row seen per question is the latest.
    latest_by_question: dict[str, dict] = {}
    for attempt in attempts:
        latest_by_question.setdefault(attempt["question_id"], attempt)
    return list(latest_by_question.values())
@router.patch("/{attempt_id}")
async def update_attempt(
    attempt_id: str,
    data: AttemptUpdate,
    user_id: str = Depends(get_current_user_id),
):
    """Update the error-book flags of one of the caller's own attempts.

    The update is scoped to the authenticated user, so a caller cannot modify
    another user's attempt; an attempt owned by someone else yields a 404.

    Raises:
        HTTPException: 400 when no field is provided, 404 when no attempt with
            this id belongs to the caller.
    """
    changes = {
        field: value
        for field, value in (
            ("in_error_book", data.in_error_book),
            ("mastered", data.mastered),
        )
        if value is not None
    }
    if not changes:
        raise HTTPException(status_code=400, detail="Nothing to update")

    result = (
        get_supabase()
        .table("user_attempts")
        .update(changes)
        .eq("id", attempt_id)
        .eq("user_id", user_id)
        .execute()
    )
    if not result.data:
        raise HTTPException(status_code=404, detail="Attempt not found")
    return result.data[0]

View File

@@ -0,0 +1,142 @@
"""试卷上传 + 处理管线"""
import asyncio
import threading
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
from app.services.supabase_client import get_supabase
from app.services.text_extractor import extract_pdf, get_full_text
from app.services.paper_processor import process_paper
from app.dependencies.auth import get_current_user_id
# Router for the paper upload, listing and lookup endpoints defined in this module.
router = APIRouter()
def _upload_and_process_sync(
    paper_id: str,
    storage_path: str,
    paper_bytes: bytes,
    answer_bytes: bytes | None,
):
    """Run on a dedicated thread: upload the PDFs to Storage, then run AI processing.

    The storage upload is best-effort. A failure is logged rather than silently
    swallowed, and processing still proceeds from the in-memory bytes.
    """
    import logging

    sb = get_supabase()
    try:
        paper_storage_path = f"{storage_path}/paper.pdf"
        sb.storage.from_("papers").upload(
            paper_storage_path, paper_bytes,
            file_options={"content-type": "application/pdf", "upsert": "true"},
        )
        paper_url = sb.storage.from_("papers").get_public_url(paper_storage_path)
        update_data: dict = {"paper_file_url": paper_url}
        if answer_bytes:
            answer_storage_path = f"{storage_path}/answer.pdf"
            sb.storage.from_("papers").upload(
                answer_storage_path, answer_bytes,
                file_options={"content-type": "application/pdf", "upsert": "true"},
            )
            update_data["answer_file_url"] = sb.storage.from_("papers").get_public_url(answer_storage_path)
        sb.table("papers").update(update_data).eq("id", paper_id).execute()
    except Exception:
        # Record the failure so a missing file URL can be diagnosed later.
        logging.getLogger(__name__).exception(
            "Storage upload failed for paper %s; continuing with processing", paper_id
        )
    # process_paper is async, so it runs in a fresh event loop on this thread.
    asyncio.run(process_paper(paper_id, paper_bytes, answer_bytes))
@router.get("/")
async def list_papers():
    """List all papers, newest first (papers are a public asset shared by all users)."""
    columns = "id, course_code, year, term, exam_type, status, question_count, total_score, difficulty_level, processing_step, processing_progress, processing_total, created_at"
    query = get_supabase().table("papers").select(columns).order("created_at", desc=True)
    response = query.execute()
    return response.data
@router.get("/mine")
async def my_papers(user_id: str = Depends(get_current_user_id)):
    """List papers uploaded by the current user, newest first (any status, including processing)."""
    columns = "id, course_code, year, term, exam_type, part_label, status, question_count, processing_step, processing_progress, processing_total, created_at"
    query = get_supabase().table("papers").select(columns).eq("user_id", user_id)
    response = query.order("created_at", desc=True).execute()
    return response.data
@router.post("/upload")
async def upload_paper(
    paper_file: UploadFile = File(...),
    answer_file: UploadFile | None = File(None),
    course_code: str = Form(...),
    year: int = Form(...),
    term: str = Form(...),
    exam_type: str = Form(...),
    user_id: str = Depends(get_current_user_id),
):
    """Upload a paper PDF (plus an optional answer PDF) and start background processing.

    The paper row is created immediately with status "processing" and returned
    to the caller; the storage upload and AI processing run on a daemon thread.

    Raises:
        HTTPException: 400 when the uploaded paper file is empty.
    """
    sb = get_supabase()
    # 1. Read the file contents and reject an empty paper before any record exists.
    paper_bytes = await paper_file.read()
    if not paper_bytes:
        raise HTTPException(status_code=400, detail="Paper file is empty")
    answer_bytes = await answer_file.read() if answer_file else None

    # 2. Create the record right away (status=processing) so the request returns quickly.
    course = course_code.upper()
    # NOTE(review): the storage prefix is built from user-supplied term/exam_type and is
    # shared by every upload with the same course/year/term/type — confirm that
    # overwriting earlier uploads (upsert) is intended.
    storage_path = f"{course}/{year}_{term}_{exam_type}"
    paper_record = sb.table("papers").insert({
        "user_id": user_id,
        "course_code": course,
        "year": year,
        "term": term,
        "exam_type": exam_type,
        "paper_file_url": "",  # filled in once the background upload finishes
        "answer_file_url": None,
        "status": "processing",
    }).execute()
    paper_id = paper_record.data[0]["id"]

    # 3. Run on a separate thread so the event loop is never blocked.
    threading.Thread(
        target=_upload_and_process_sync,
        args=(paper_id, storage_path, paper_bytes, answer_bytes),
        daemon=True,
    ).start()
    return {
        "paper_id": paper_id,
        "status": "processing",
        "message": "试卷已上传,正在处理中...",
    }
@router.get("/{paper_id}")
async def get_paper(paper_id: str):
    """Return one paper row (including its processing status), or 404 if it does not exist."""
    matches = get_supabase().table("papers").select("*").eq("id", paper_id).execute().data
    if matches:
        return matches[0]
    raise HTTPException(status_code=404, detail="Paper not found")
@router.get("/{paper_id}/questions")
async def get_questions(paper_id: str):
    """Return every question of a paper (all columns, including the AI trio), in display order."""
    query = get_supabase().table("paper_questions").select("*").eq("paper_id", paper_id)
    response = query.order("display_order").execute()
    return response.data

View File

@@ -0,0 +1,325 @@
"""题目相关:变式题生成 + 相似题召回"""
from __future__ import annotations
import asyncio
import time
from fastapi import APIRouter, HTTPException, Depends
from pydantic import BaseModel
from app.services.supabase_client import get_supabase
from app.services.grader import generate_variant
from app.dependencies.auth import get_current_user_id
# Simple in-memory cache: question_id → (timestamp, result)
# Timestamps come from time.time(); entries older than _CACHE_TTL seconds are treated as stale.
_similar_cache: dict[str, tuple[float, list]] = {}
_CACHE_TTL = 300  # 5 minutes


class VariantUpdate(BaseModel):
    """Request body for updating a variant; None means "leave unchanged"."""

    favorited: bool | None = None


# Router for the variant and similar-question endpoints defined in this module.
router = APIRouter()
def normalized_labels(values: list[str] | None) -> dict[str, str]:
    """Map each non-empty label's lower-cased form to the label itself.

    When several labels share a lower-cased form, the last one wins.
    """
    return {label.lower(): label for label in (values or []) if label}
def question_family(question: dict) -> str:
    """Return the question's format, else its type, else ``"unknown"``."""
    fmt = question.get("question_format")
    if fmt:
        return fmt
    kind = question.get("question_type")
    return kind if kind else "unknown"
def display_topics(question: dict) -> list[str]:
    """Return the topic labels to display for a question, without duplicates.

    The primary ``analytics_topic`` comes first, followed by ``topic_tags``.
    The legacy ``topics`` list is used only when those yield nothing.
    """

    def extend_unique(seed: list, values) -> list:
        # Append truthy values not already present, preserving order.
        merged = list(seed)
        for value in values or []:
            if value and value not in merged:
                merged.append(value)
        return merged

    primary = question.get("analytics_topic")
    preferred = extend_unique([primary] if primary else [], question.get("topic_tags"))
    return preferred or extend_unique([], question.get("topics"))
def similarity_score(
    target: dict,
    candidate: dict,
    text_score: float = 0.0,
) -> tuple[int, list[str]]:
    """Score how similar ``candidate`` is to ``target`` and explain why.

    Points: same analytics topic 40, shared topic tags up to 20, shared skill
    tags up to 20, same format 10, same difficulty 5, full-text similarity up
    to 20. The total is capped at 99. Returns ``(score, reasons)``.
    """
    total = 0
    why: list[str] = []

    def overlap(field: str) -> tuple[dict[str, str], list[str]]:
        # Case-insensitive intersection of a tag field; keys sorted for stable output.
        mine = normalized_labels(target.get(field))
        theirs = normalized_labels(candidate.get(field))
        return mine, sorted(mine.keys() & theirs.keys())

    # Primary topic bucket: 40 pts
    primary = target.get("analytics_topic")
    same_primary = bool(primary) and primary == candidate.get("analytics_topic")
    if same_primary:
        total += 40
        why.append(f"Same topic: {primary}")

    # Concept overlap: up to 20 pts
    topic_labels, common_topics = overlap("topic_tags")
    if common_topics:
        total += min(len(common_topics) * 10, 20)
        # The concept reason is redundant when the primary topic already matched.
        if not same_primary:
            shown = ", ".join(topic_labels[key] for key in common_topics[:2])
            why.append("Shared concept: " + shown)

    # Skill overlap: up to 20 pts
    skill_labels, common_skills = overlap("skill_tags")
    if common_skills:
        total += min(len(common_skills) * 10, 20)
        shown = ", ".join(skill_labels[key] for key in common_skills[:2])
        why.append("Shared skill: " + shown)

    # Same question format: 10 pts
    if question_family(candidate) == question_family(target):
        total += 10
        why.append("Same format")

    # Same difficulty: 5 pts
    candidate_difficulty = candidate.get("difficulty")
    if candidate_difficulty and candidate_difficulty == target.get("difficulty"):
        total += 5
        why.append("Same difficulty")

    # Full-text similarity from PostgreSQL ts_rank_cd: up to 20 pts
    if text_score > 0:
        wording_pts = min(round(text_score * 60), 20)
        total += wording_pts
        if wording_pts >= 4:
            why.append("Similar wording")

    return min(total, 99), why
@router.get("/variants/favorited")
async def get_favorited_variants(user_id: str = Depends(get_current_user_id)):
    """Return all variants the user has favorited, newest first (used by the Error Book)."""
    columns = "*, paper_questions(question_number, paper_id, papers(id, course_code, year, term, exam_type, part_label))"
    query = (
        get_supabase()
        .table("question_variants")
        .select(columns)
        .eq("user_id", user_id)
        .eq("favorited", True)
        .order("created_at", desc=True)
    )
    favorites = query.execute().data
    return favorites
@router.post("/{question_id}/variant")
async def create_variant(question_id: str, user_id: str = Depends(get_current_user_id)):
    """Generate a variant of a question, store it for the user and return the stored row.

    Raises:
        HTTPException: 404 when the source question does not exist.
    """
    sb = get_supabase()
    lookup = sb.table("paper_questions").select("*").eq("id", question_id).execute()
    if not lookup.data:
        raise HTTPException(status_code=404, detail="Question not found")
    source = lookup.data[0]

    # Generation is blocking, so it runs in a worker thread.
    generated = await asyncio.to_thread(generate_variant, source)
    # The variant carries over the source question's knowledge reminder.
    generated["knowledge_reminder"] = source.get("knowledge_reminder", "")

    payload = {
        "user_id": user_id,
        "source_question_id": question_id,
        "variant_data": generated,
        "favorited": False,
    }
    stored = sb.table("question_variants").insert(payload).execute().data[0]
    stored["source_question_number"] = source["question_number"]
    return stored
@router.get("/{question_id}/variants")
async def list_variants(question_id: str, user_id: str = Depends(get_current_user_id)):
    """Return the user's variants of one question, newest first, each tagged with the source number."""
    sb = get_supabase()
    source = sb.table("paper_questions").select("question_number").eq("id", question_id).execute()
    # An unknown question yields an empty number rather than an error.
    if source.data:
        source_number = source.data[0]["question_number"]
    else:
        source_number = ""

    variant_query = (
        sb.table("question_variants")
        .select("*")
        .eq("user_id", user_id)
        .eq("source_question_id", question_id)
        .order("created_at", desc=True)
    )
    variants = variant_query.execute().data
    for variant in variants:
        variant["source_question_number"] = source_number
    return variants
@router.patch("/variant/{variant_id}")
async def update_variant(variant_id: str, data: VariantUpdate, user_id: str = Depends(get_current_user_id)):
    """Update one of the user's variants (favorite / unfavorite) and return the updated row.

    Raises:
        HTTPException: 400 when no field is provided, 404 when no variant with
            this id belongs to the user.
    """
    sb = get_supabase()
    changes: dict = {"favorited": data.favorited} if data.favorited is not None else {}
    if not changes:
        raise HTTPException(status_code=400, detail="Nothing to update")

    # Scoped by user_id so a user can only modify their own variants.
    scoped = sb.table("question_variants").update(changes).eq("id", variant_id).eq("user_id", user_id)
    outcome = scoped.execute()
    if outcome.data:
        return outcome.data[0]
    raise HTTPException(status_code=404, detail="Variant not found")
@router.delete("/variant/{variant_id}", status_code=204)
async def delete_variant(variant_id: str, user_id: str = Depends(get_current_user_id)):
    """Delete one of the user's variants; responds 204 whether or not a row matched."""
    variants = get_supabase().table("question_variants")
    # Scoped by user_id so a user can only delete their own variants.
    owned_match = variants.delete().eq("id", variant_id).eq("user_id", user_id)
    owned_match.execute()
def _evict_expired_similar(now: float) -> None:
    """Drop cache entries older than the TTL so stale results do not accumulate in memory."""
    expired = [qid for qid, (stamp, _) in _similar_cache.items() if now - stamp >= _CACHE_TTL]
    for qid in expired:
        _similar_cache.pop(qid, None)


@router.get("/{question_id}/similar")
async def get_similar_questions(question_id: str, limit: int = 6):
    """Retrieve similar questions from the same course.

    Resolution order: in-memory cache, then the pre-computed
    ``similar_questions`` column, then an on-the-fly ranking over the other
    ready papers of the same course. ``limit`` is clamped to 1..12.

    Raises:
        HTTPException: 404 when the question or its paper does not exist.
    """
    import logging

    logger = logging.getLogger(__name__)
    max_items = max(1, min(limit, 12))

    # Cache hit
    now = time.time()
    cached = _similar_cache.get(question_id)
    if cached and (now - cached[0]) < _CACHE_TTL:
        return cached[1][:max_items]
    # On a miss, purge everything that has expired (including this key's stale entry).
    _evict_expired_similar(now)

    sb = get_supabase()
    result = sb.table("paper_questions").select("*, similar_questions").eq("id", question_id).execute()
    if not result.data:
        raise HTTPException(status_code=404, detail="Question not found")
    target = result.data[0]

    # Return the pre-computed list immediately when the column is populated.
    if target.get("similar_questions"):
        precomputed = target["similar_questions"]
        _similar_cache[question_id] = (time.time(), precomputed)
        return precomputed[:max_items]

    # Fallback: compute on-the-fly for questions not yet backfilled.
    paper_result = sb.table("papers").select("id, course_code").eq("id", target["paper_id"]).execute()
    if not paper_result.data:
        raise HTTPException(status_code=404, detail="Paper not found")
    course_code = paper_result.data[0]["course_code"]
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label")
        .eq("course_code", course_code)
        .eq("status", "ready")
        .execute()
        .data
    )
    # Candidates come only from other papers of the same course.
    paper_ids = [paper["id"] for paper in papers if paper["id"] != target["paper_id"]]
    if not paper_ids:
        return []
    papers_by_id = {paper["id"]: paper for paper in papers}

    # Pre-filter by analytics_topic in DB when possible (cuts candidates from ~250 to ~30)
    candidates_query = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
            "difficulty, knowledge_reminder, ai_hint, solution"
        )
        .in_("paper_id", paper_ids)
    )
    target_topic = target.get("analytics_topic")
    if target_topic:
        candidates_query = candidates_query.eq("analytics_topic", target_topic)
    candidates = candidates_query.execute().data
    if not candidates:
        return []

    # Batch full-text scores from PostgreSQL (skip if too many candidates — slow)
    text_scores: dict[str, float] = {}
    if len(candidates) <= 50:
        try:
            rpc_result = sb.rpc(
                "text_similarity_scores",
                {
                    "query_text": target.get("question_text") or "",
                    "candidate_ids": [c["id"] for c in candidates],
                },
            ).execute()
            for row in rpc_result.data or []:
                text_scores[row["question_id"]] = float(row["text_score"] or 0)
        except Exception:
            # Text similarity is an optional signal; rank without it but leave a trace.
            logger.warning("text_similarity_scores RPC failed for %s", question_id, exc_info=True)

    ranked = []
    for candidate in candidates:
        text_score = text_scores.get(candidate["id"], 0.0)
        match_percent, reasons = similarity_score(target, candidate, text_score)
        if match_percent < 20:
            continue
        paper = papers_by_id.get(candidate["paper_id"], {})
        # `or ''` tolerates NULL year/term/exam_type columns instead of raising.
        source = (
            f"{paper.get('year') or ''} {(paper.get('term') or '').title()} "
            f"{(paper.get('exam_type') or '').title()}"
        ).strip()
        if paper.get("part_label"):
            source = f"{source} Part {paper['part_label']}"
        ranked.append(
            {
                "id": candidate["id"],
                "paper_id": candidate["paper_id"],
                "source": source,
                "question_number": candidate["question_number"],
                "match_percent": match_percent,
                "match_reasons": reasons,
                "question_type": question_family(candidate),
                "question_text": candidate["question_text"],
                "topics": display_topics(candidate),
                "difficulty": candidate.get("difficulty"),
                "knowledge_reminder": candidate.get("knowledge_reminder", ""),
                "ai_hint": candidate.get("ai_hint", ""),
                "solution": candidate.get("solution", ""),
            }
        )
    ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"]))

    # Keep only the best-scoring question per paper
    seen_papers: set[str] = set()
    deduped = []
    for item in ranked:
        if item["paper_id"] not in seen_papers:
            seen_papers.add(item["paper_id"])
            deduped.append(item)

    _similar_cache[question_id] = (time.time(), deduped)
    # Persist to DB so future requests are instant (best-effort).
    try:
        sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", question_id).execute()
    except Exception:
        logger.warning("Could not persist similar questions for %s", question_id, exc_info=True)
    return deduped[:max_items]

View File

View File

@@ -0,0 +1,146 @@
"""OCR, grading, and variant generation prompts"""
import json
import base64
from app.services.llm_clients import get_vision_client, get_deepseek_client
# System prompt for handwriting OCR: asks the vision model for a faithful transcription
# (LaTeX for math, "[unclear]" for illegible parts) and nothing else. Sent as-is by ocr_photo().
OCR_PROMPT = """You are an expert at recognizing handwritten answers. Analyze this photo of a student's handwritten answer and extract the text and mathematical formulas.
Requirements:
- Faithfully extract what the student wrote, do not modify or correct
- Use LaTeX format for math formulas (e.g. $x^2 + 1$)
- If there are multiple steps, list them in original order
- If some handwriting is unclear, mark with [unclear]
Return only the extracted text, no additional explanation."""
# Grading prompt template, filled with str.format() in grade_answer(). Placeholders:
# question_number, question_type, question_text, score, reference_answer, student_answer.
# Braces that must reach the model literally (the JSON skeleton) are doubled for str.format().
GRADING_PROMPT = """You are an expert academic grader. Grade the following student answer. ALL output must be in English.
Question info:
- Number: {question_number}
- Type: {question_type}
- Question: {question_text}
- Score: {score}
Reference answer / solution:
{reference_answer}
Student answer:
{student_answer}
Grade and return JSON:
{{
"is_correct": true/false,
"score_given": 0-{score},
"feedback": "<HTML> Step-by-step analysis of the student's answer, pointing out correct parts and errors, using KaTeX formulas </HTML>",
"error_at_step": null or the step number where errors begin (integer)
}}
Grading rules:
- MC / fill-blank: only correct if answer matches exactly
- Long questions: give partial credit for correct steps even if the final answer is wrong
- feedback in HTML format, supports KaTeX ($..$ inline, $$...$$ block)
- Mark errors with <div class="common-error">...</div>
- Identify exactly which step the error starts"""
# Variant-generation prompt template, filled with str.format() in generate_variant(). Placeholders:
# question_type, question_text, topics, difficulty, answer.
# Braces that must reach the model literally (the JSON skeleton) are doubled for str.format().
VARIANT_PROMPT = """You are an expert exam question creator. Generate a similar but different variant question based on the original below. ALL output must be in English.
Original question info:
- Type: {question_type}
- Question: {question_text}
- Topics: {topics}
- Difficulty: {difficulty}
- Reference answer: {answer}
Requirements:
- Variant must test the same knowledge points at similar difficulty
- Data/scenario/wording must differ — don't just change numbers
- Must provide a complete correct answer
Format requirements (CRITICAL):
- All text in HTML format, absolutely NO markdown syntax
- Code: <pre><code class="language-xxx">...</code></pre>, NOT ```
- Math: $...$ (inline) or $$...$$ (block), KaTeX compatible
- Line breaks: <br>, paragraphs: <p>
Return JSON:
{{
"question_text": "HTML formatted variant question",
"question_type": "{question_type}",
"options": [MC only, format {{"label":"A","text":"..."}}, ...] or null,
"correct_answer": "Correct answer (plain text)",
"ai_hint": "HTML formatted hint that guides thinking WITHOUT giving the answer",
"solution": "HTML formatted complete step-by-step solution"
}}"""
def _sniff_image_mime(photo_bytes: bytes) -> str:
    """Guess the image MIME type from its magic bytes; fall back to JPEG when unrecognised."""
    if photo_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if photo_bytes[:4] == b"RIFF" and photo_bytes[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def ocr_photo(photo_bytes: bytes) -> str:
    """Transcribe a photographed handwritten answer with the Gemini vision model.

    Args:
        photo_bytes: Raw bytes of the uploaded image.

    Returns:
        The text extracted by the model, or an empty string when the model
        returns no content.
    """
    client = get_vision_client()
    b64 = base64.b64encode(photo_bytes).decode("utf-8")
    # Label the data URL with the detected type instead of always claiming JPEG,
    # so PNG/WebP uploads are not mislabelled.
    mime = _sniff_image_mime(photo_bytes)
    resp = client.chat.completions.create(
        model="gemini-2.5-flash",
        messages=[
            {"role": "system", "content": OCR_PROMPT},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {
                    "url": f"data:{mime};base64,{b64}",
                }},
            ]},
        ],
        temperature=0,
        max_tokens=2000,
    )
    return resp.choices[0].message.content or ""
def grade_answer(question: dict, student_answer: str) -> dict:
    """Grade a student's answer with the DeepSeek chat model.

    Args:
        question: Question row; must contain ``question_number``,
            ``question_type`` and ``question_text``. ``raw_answer_text`` (or,
            failing that, ``solution``) is used as the reference answer and
            ``score`` as the maximum score.
        student_answer: The student's answer as text.

    Returns:
        The parsed JSON object produced by the model (see GRADING_PROMPT).

    Raises:
        KeyError: If a required question field is missing.
        ValueError: If the model returns no content or invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    reference = question.get("raw_answer_text") or question.get("solution") or "No reference answer"
    score = question.get("score") or "unknown"
    client = get_deepseek_client()
    resp = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": GRADING_PROMPT.format(
                question_number=question["question_number"],
                question_type=question["question_type"],
                question_text=question["question_text"],
                score=score,
                reference_answer=reference,
                student_answer=student_answer,
            )},
        ],
        temperature=0.2,
        response_format={"type": "json_object"},
    )
    content = resp.choices[0].message.content
    if not content:
        # json.loads(None) would raise an opaque TypeError; fail with a clear message instead.
        raise ValueError("Grading model returned an empty response")
    return json.loads(content)
def generate_variant(question: dict) -> dict:
    """Generate a variant of a question with the DeepSeek chat model.

    Args:
        question: Question row; must contain ``question_type`` and
            ``question_text``. The reference answer is the first non-empty of
            ``correct_option``, ``correct_answer``, ``raw_answer_text``
            (``"N/A"`` otherwise). ``topics`` and ``difficulty`` are optional.

    Returns:
        The parsed JSON object produced by the model (see VARIANT_PROMPT).

    Raises:
        KeyError: If a required question field is missing.
        ValueError: If the model returns no content or invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    answer = (
        question.get("correct_option")
        or question.get("correct_answer")
        or question.get("raw_answer_text")
        or "N/A"
    )
    # The key may be present with a None value, which .get(key, default) does not cover.
    topics = question.get("topics") or []
    difficulty = question.get("difficulty") or "medium"
    client = get_deepseek_client()
    resp = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": VARIANT_PROMPT.format(
                question_type=question["question_type"],
                question_text=question["question_text"],
                topics=", ".join(str(topic) for topic in topics),
                difficulty=difficulty,
                answer=answer,
            )},
        ],
        temperature=0.5,
        response_format={"type": "json_object"},
    )
    content = resp.choices[0].message.content
    if not content:
        # json.loads(None) would raise an opaque TypeError; fail with a clear message instead.
        raise ValueError("Variant model returned an empty response")
    return json.loads(content)

View File

@@ -0,0 +1,74 @@
import httpx
from openai import OpenAI
from app.config import get_settings
# Per-phase HTTP timeouts handed to the clients below that pass `timeout=`
# (get_vision_client, get_gemini_lite_client, get_deepseek_client).
# get_gpt_client and get_qwen_client do not pass it.
_TIMEOUT = httpx.Timeout(connect=10, read=300, write=60, pool=10)
# Module-level client singletons; each stays None until its get_*_client() is first called.
_gpt_client: OpenAI | None = None
_qwen_client: OpenAI | None = None
_gemini_flash_client: OpenAI | None = None
_gemini_lite_client: OpenAI | None = None
_deepseek_client: OpenAI | None = None
def get_gpt_client() -> OpenAI:
    """Return the shared laozhang API client (gpt-4o / gpt-4o-mini), creating it on first use."""
    global _gpt_client
    if _gpt_client is not None:
        return _gpt_client
    settings = get_settings()
    _gpt_client = OpenAI(
        base_url=settings.laozhang_base_url,
        api_key=settings.laozhang_api_key,
    )
    return _gpt_client
def get_qwen_client() -> OpenAI:
    """Return the shared DashScope client (qwen-plus), creating it on first use."""
    global _qwen_client
    if _qwen_client is not None:
        return _qwen_client
    settings = get_settings()
    _qwen_client = OpenAI(
        base_url=settings.dashscope_base_url,
        api_key=settings.dashscope_api_key,
    )
    return _qwen_client
def get_vision_client() -> OpenAI:
    """Return the shared client for the official Google Gemini API.

    Vision model used for question splitting and OCR; per the original note,
    usable when deployed in Singapore. Created on first use.
    """
    global _gemini_flash_client
    if _gemini_flash_client is not None:
        return _gemini_flash_client
    settings = get_settings()
    _gemini_flash_client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=settings.google_gemini_api_key,
        timeout=_TIMEOUT,
    )
    return _gemini_flash_client
def get_gemini_lite_client() -> OpenAI:
    """Return the shared laozhang client for gemini-3.1-flash-lite-preview.

    Lightweight model used for the AI trio. Created on first use.
    """
    global _gemini_lite_client
    if _gemini_lite_client is not None:
        return _gemini_lite_client
    settings = get_settings()
    _gemini_lite_client = OpenAI(
        base_url=settings.laozhang_base_url,
        api_key=settings.laozhang_api_key,
        timeout=_TIMEOUT,
    )
    return _gemini_lite_client
def get_deepseek_client() -> OpenAI:
    """Return the shared DeepSeek client (deepseek-chat, used for the AI trio), creating it on first use."""
    global _deepseek_client
    if _deepseek_client is not None:
        return _deepseek_client
    settings = get_settings()
    _deepseek_client = OpenAI(
        base_url=settings.deepseek_base_url,
        api_key=settings.deepseek_api_key,
        timeout=_TIMEOUT,
    )
    return _deepseek_client

View File

@@ -0,0 +1,576 @@
"""试卷处理管线PDF → 结构化题目 → AI 三件套Vision 模式)"""
import asyncio
import base64
import io
import json
import re
import traceback
from contextlib import redirect_stdout
import fitz # pymupdf
from app.services.supabase_client import get_supabase
from app.services.llm_clients import get_vision_client, get_deepseek_client
def strip_nulls(obj):
"""Recursively remove \\u0000 null bytes from strings (PostgreSQL rejects them)."""
if isinstance(obj, str):
return obj.replace("\u0000", "")
if isinstance(obj, dict):
return {k: strip_nulls(v) for k, v in obj.items()}
if isinstance(obj, list):
return [strip_nulls(i) for i in obj]
return obj
# ============================================
# Prompts
# ============================================
# System prompt for question extraction from page images. process_paper() passes it to
# gemini_vision_json() unformatted, so the single braces in the JSON example are literal.
STRUCTURE_PROMPT = """You are an expert exam paper structure analyst. You are given images of a past exam paper. Analyze every page carefully and extract all questions into structured JSON.
All generated values must be in English. Do not output Chinese.
CRITICAL RULES for question_text:
- Each question's question_text must be FULLY SELF-CONTAINED. Include ALL context needed to solve it.
- For sub-questions (e.g. (a)(i)), copy the ENTIRE parent question setup (variable definitions, code blocks, problem description) into the question_text, then append the specific sub-question.
- For Python/code questions: include ALL variable definitions and import statements verbatim, exactly as they appear in the exam, preserving multi-line arrays and data structures completely.
- Never truncate code. If a variable is defined across multiple lines (e.g. a numpy array), include every line.
Output JSON format (strictly follow):
{
"total_score": 100,
"difficulty_level": "medium",
"topics_summary": {"Topic A": 40, "Topic B": 30, "Topic C": 30},
"questions": [
{
"question_number": "1a",
"parent_question": "1",
"question_type": "mc",
"question_text": "Original question text...",
"score": 5,
"page_number": 1,
"options": [{"label": "A", "text": "Option content"}, {"label": "B", "text": "..."}],
"topics": ["Linked List", "Pointer"],
"difficulty": "easy"
},
{
"question_number": "2",
"parent_question": null,
"question_type": "long_question",
"question_text": "Original question text...",
"score": 15,
"page_number": 2,
"options": null,
"topics": ["Recursion"],
"difficulty": "hard"
}
]
}
Rules:
- question_type must be one of: "mc" (multiple choice), "true_false" (true/false), "fill_blank" (fill in blank), "long_question" (long question)
- True/False questions MUST use "true_false" type, with options set to [{"label":"True","text":"True"},{"label":"False","text":"False"}], correct_option as "True" or "False"
- Multiple choice must extract the options array
- Sub-questions use parent_question to link to parent: "1a" parent is "1"
- Independent questions without sub-questions set parent_question to null
- page_number inferred from where the question appears
- topics inferred from the question content
- difficulty: "easy" | "medium" | "hard"
- Extract ALL questions, do not miss any
- Keep topic labels in English only
"""
# Answer-matching prompt template, filled with str.format() in process_paper().
# Placeholders: questions_json, answer_text (process_paper passes "(See images)" for the latter).
# Braces of the JSON skeleton are doubled for str.format().
ANSWER_MATCH_PROMPT = """You are an expert exam answer matching specialist. Below is the answer text for an exam paper. Extract and match answers to their corresponding question numbers.
All generated values must be in English. Do not output Chinese.
Question structure:
{questions_json}
Answer text:
{answer_text}
Output JSON format:
{{
"answers": [
{{
"question_number": "1a",
"correct_option": "B",
"correct_answer": null,
"raw_answer_text": "Original answer text..."
}},
{{
"question_number": "2",
"correct_option": null,
"correct_answer": null,
"raw_answer_text": "Complete solution process and answer..."
}}
]
}}
Rules:
- For MC questions, fill correct_option (e.g. "B")
- For fill-blank questions, fill correct_answer (e.g. "O(n log n)")
- For long questions, only fill raw_answer_text (complete solution process)
- Match all questions where answers can be found
- Keep raw_answer_text faithful to the source answer, but do not add Chinese commentary
"""
# Single-question analysis prompt template for str.format(). Placeholders: question_number,
# question_type, score, question_text, topics, answer_section. Literal braces (JSON skeleton,
# LaTeX arguments) are doubled.
# NOTE(review): no caller is visible in this part of the file; the pipeline below uses
# BATCH_ANALYSIS_PROMPT — confirm whether this template is still needed.
ANALYSIS_PROMPT = """You are an expert academic answer analyst. Generate three sections for the following exam question. ALL output must be in English.
Question info:
- Number: {question_number}
- Type: {question_type}
- Score: {score}
- Question: {question_text}
- Topics: {topics}
{answer_section}
Generate THREE sections in HTML format (supports KaTeX: block $$ ... $$ inline $ ... $):
Output JSON:
{{
"knowledge_reminder": "<HTML> Prerequisite knowledge points needed for this question, as a concise bullet list </HTML>",
"ai_hint": "<HTML> A hint that guides thinking direction WITHOUT giving away the answer </HTML>",
"solution": "<HTML> Complete step-by-step solution (Step 1, Step 2, ...) with derivations, formulas, and common mistake warnings </HTML>"
}}
Solution requirements:
- Must include complete working process, not just the answer
- Each step must have an explanation
- If a reference answer is provided, derive the solution based on it
- If no reference answer, work out the complete solution independently
- For MC questions, explain why the correct option is right AND why others are wrong
- Use <ol> or numbered steps
- Mark common mistakes with <div class="common-error">...</div>
KaTeX formula rules:
- Block formula: $$ on its own line, with blank lines before and after
- Inline formula: $x^2$ no line break
- Matrix: \\begin{{bmatrix}} ... \\end{{bmatrix}}
- Fraction: \\frac{{a}}{{b}}
"""
# Batched analysis prompt template, filled with str.format() in _resume_ai_trio().
# Placeholder: questions_payload (a JSON array of question dicts). The JSON skeleton's braces
# are doubled for str.format().
BATCH_ANALYSIS_PROMPT = """You are an expert academic answer analyst. Generate three study sections for each question below. ALL output must be in English.
For every question, return:
- knowledge_reminder: concise prerequisite bullets in HTML
- ai_hint: a helpful hint in HTML without revealing the final answer
- solution: a complete step-by-step solution in HTML
Return JSON in this exact format:
{{
"analyses": [
{{
"question_number": "1a",
"knowledge_reminder": "<HTML>...</HTML>",
"ai_hint": "<HTML>...</HTML>",
"solution": "<HTML>...</HTML>"
}}
]
}}
Rules:
- Return one item for every provided question_number
- Keep each item matched to the same question_number
- All text must be in English
- HTML only, KaTeX compatible
- For MC questions, explain why the correct option is right and why the others are wrong
- For long questions, show a complete derivation or reasoning chain
- Use <ol> or numbered steps in solution when appropriate
- Mark common mistakes with <div class="common-error">...</div>
- CRITICAL: When a question_text contains "[Context from parent question X]" followed by "[Sub-question Y]", the parent section is background context only. You MUST solve ONLY the specific sub-question labeled [Sub-question Y]. Do NOT solve other sub-questions listed in the parent context. Give one precise answer for that single sub-question only.
Questions:
{questions_payload}
"""
# ============================================
# Processing pipeline
# ============================================
# Lower-case substrings that mark an error message as transient.
RETRYABLE_ERROR_MARKERS = (
    "429",
    "rate limit",
    "rate_limit",
    "too many requests",
    "timeout",
    "timed out",
    "connection",
)

# HTTP statuses treated as transient when the exception exposes a ``status_code`` attribute.
_RETRYABLE_STATUS_CODES = frozenset({408, 409, 429, 500, 502, 503, 504})


def is_retryable_error(exc: Exception) -> bool:
    """Return True when ``exc`` looks like a transient failure worth retrying.

    An error is retryable if any of the following holds:
      * it is a built-in ``TimeoutError`` or ``ConnectionError`` (their message
        may be empty, so text matching alone misses them);
      * it carries a ``status_code`` attribute with a transient HTTP status;
      * its lower-cased message contains one of RETRYABLE_ERROR_MARKERS.
    """
    if isinstance(exc, (TimeoutError, ConnectionError)):
        return True
    if getattr(exc, "status_code", None) in _RETRYABLE_STATUS_CODES:
        return True
    message = str(exc).lower()
    return any(marker in message for marker in RETRYABLE_ERROR_MARKERS)
def pdf_to_images(pdf_bytes: bytes, dpi: int = 96) -> list[str]:
    """Render every page of a PDF to a base64-encoded PNG.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Render resolution; the default of 96 is the original author's
            trade-off between legibility and cost.

    Returns:
        One base64 string per page, in page order.
    """
    zoom = dpi / 72  # the render matrix is expressed relative to 72 dpi
    matrix = fitz.Matrix(zoom, zoom)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(matrix=matrix, colorspace=fitz.csRGB)
            images.append(base64.b64encode(pix.tobytes("png")).decode())
        return images
    finally:
        # Close the document even when rendering a page fails.
        doc.close()
def parse_json_response(text: str) -> dict:
    """Parse a model response as JSON, tolerating common formatting slips.

    Repairs applied before parsing:
      * a surrounding markdown code fence (```json ... ```) is removed;
      * control characters other than tab, newline and carriage return are dropped;
      * a backslash that starts an invalid JSON escape (e.g. LaTeX ``\\alpha``,
        or ``\\u`` not followed by four hex digits as in ``\\underline``) is doubled.

    Raises:
        ValueError: If the response is empty or still not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if not text or not text.strip():
        raise ValueError("Model returned an empty response; expected JSON")
    text = text.strip()
    # Drop a ```json ... ``` wrapper.
    if text.startswith("```"):
        lines = text.splitlines()
        text = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])
    # Remove illegal control characters (0x00-0x1F except \t \n \r).
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
    # Double the backslash of an invalid escape. Only a backslash preceded by an even
    # number of backslashes starts an escape, so already-escaped backslashes are untouched.
    text = re.sub(
        r'(?<!\\)((?:\\\\)*)\\(u(?![0-9a-fA-F]{4})|[^"\\/bfnrtu])',
        r'\1\\\\\2',
        text,
    )
    return json.loads(text)
async def gemini_vision_json(
    *,
    system_prompt: str,
    images: list[str],
    user_text: str = "",
    temperature: float = 0,
    max_attempts: int = 6,
) -> dict:
    """Send images plus a prompt to the Gemini vision model and return its JSON reply.

    Args:
        system_prompt: Instructions for the model; a JSON-only reminder is appended.
        images: Base64-encoded PNG images, sent in order.
        user_text: Optional text appended after the images.
        temperature: Sampling temperature.
        max_attempts: Total attempts; errors accepted by is_retryable_error() are
            retried with exponential backoff (2s doubling up to 30s).

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        Exception: The last error, once it is non-retryable or attempts run out.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    client = get_vision_client()
    content: list = [
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
        for b64 in images
    ]
    if user_text:
        content.append({"type": "text", "text": user_text})
    messages = [
        {"role": "system", "content": system_prompt + "\n\nIMPORTANT: Your entire response must be valid JSON only. No markdown, no code fences, no extra text."},
        {"role": "user", "content": content},
    ]
    delay_seconds = 2
    for attempt in range(1, max_attempts + 1):
        try:
            # The SDK call is synchronous; run it in a worker thread so the event
            # loop is not blocked while the model responds.
            response = await asyncio.to_thread(
                client.chat.completions.create,
                model="gemini-2.5-flash",
                messages=messages,
                temperature=temperature,
                max_tokens=16384,
            )
            return parse_json_response(response.choices[0].message.content)
        except Exception as exc:
            if attempt == max_attempts or not is_retryable_error(exc):
                raise
            await asyncio.sleep(delay_seconds)
            delay_seconds = min(delay_seconds * 2, 30)
    raise RuntimeError("unreachable: the retry loop returns or raises")
async def deepseek_json_completion(
    *,
    system_prompt: str,
    user_prompt: str | None = None,
    temperature: float = 0,
    max_attempts: int = 6,
) -> dict:
    """Run a text-only DeepSeek JSON completion (used to generate the AI trio).

    Args:
        system_prompt: System message.
        user_prompt: Optional user message.
        temperature: Sampling temperature.
        max_attempts: Total attempts; errors accepted by is_retryable_error() are
            retried with exponential backoff (2s doubling up to 30s).

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        Exception: The last error, once it is non-retryable or attempts run out.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    client = get_deepseek_client()
    messages = [{"role": "system", "content": system_prompt}]
    if user_prompt:
        messages.append({"role": "user", "content": user_prompt})
    delay_seconds = 2
    for attempt in range(1, max_attempts + 1):
        try:
            # The SDK call is synchronous; run it in a worker thread so the event
            # loop is not blocked while the model responds.
            response = await asyncio.to_thread(
                client.chat.completions.create,
                model="deepseek-chat",
                messages=messages,
                temperature=temperature,
                max_tokens=8192,
                response_format={"type": "json_object"},
            )
            # Shared cleanup (control characters, invalid escapes) lives in parse_json_response.
            return parse_json_response(response.choices[0].message.content)
        except Exception as exc:
            if attempt == max_attempts or not is_retryable_error(exc):
                raise
            await asyncio.sleep(delay_seconds)
            delay_seconds = min(delay_seconds * 2, 30)
    raise RuntimeError("unreachable: the retry loop returns or raises")
def chunked(items: list[dict], size: int) -> list[list[dict]]:
    """Split ``items`` into consecutive slices of at most ``size`` elements."""
    chunks = []
    for start in range(0, len(items), size):
        chunks.append(items[start:start + size])
    return chunks
def _question_sort_key(qnum: str) -> tuple:
"""自然排序题号1a < 1b < ... < 1i < 1j < 2ai < 2aii < 10a"""
parts = re.findall(r'(\d+|[a-zA-Z]+|[()]+)', qnum)
key = []
for idx, p in enumerate(parts):
if p.isdigit():
key.append((0, int(p), ''))
elif p in ('(', ')'):
continue
else:
# Single letter (a-z): always sort alphabetically (a=1, b=2, ..., j=10)
if len(p) == 1 and p.isalpha():
key.append((1, ord(p.lower()) - ord('a') + 1, p))
else:
# Multi-letter: roman numerals for sub-sub-questions (i=1, ii=2, iii=3, ...)
romans = {'i':1,'ii':2,'iii':3,'iv':4,'v':5,'vi':6,'vii':7,'viii':8,'ix':9,'x':10,'xi':11,'xii':12,'xiii':13}
if p.lower() in romans:
key.append((2, romans[p.lower()], p))
else:
key.append((1, 0, p))
return tuple(key)
def sort_questions(questions: list[dict]) -> list[dict]:
    """Return the questions sorted naturally by their ``question_number``.

    A missing or ``None`` number sorts first; non-string numbers (e.g. an
    integer returned by the model) are converted to text before sorting.
    """
    def natural_key(question: dict) -> tuple:
        number = question.get("question_number")
        return _question_sort_key("" if number is None else str(number))

    return sorted(questions, key=natural_key)
# A line that clearly starts Python code: an import, an assignment or a print call.
_CODE_START = re.compile(r"^\s*(import |from \w|[A-Za-z_]\w*\s*=|print\()")


def _bracket_delta(line: str) -> int:
    """Return opening minus closing brackets on ``line``, skipping quoted text and ``#`` comments."""
    delta = 0
    quote = None
    escaped = False
    for ch in line:
        if quote is not None:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
            continue
        if ch in "\"'":
            quote = ch
        elif ch == "#":
            break
        elif ch in "([{":
            delta += 1
        elif ch in ")]}":
            delta -= 1
    return delta


def extract_code_block(text: str) -> str:
    """Extract the Python code lines embedded in a question's text.

    A line is kept when it clearly starts code (import / assignment / print),
    or when it continues a statement whose brackets are still open. Kept lines
    are stripped of surrounding whitespace and joined with newlines.

    Args:
        text: Question text; ``None`` or empty yields an empty string.

    Returns:
        The collected code lines, or ``""`` when none are found.
    """
    collected = []
    depth = 0  # number of brackets currently open across collected lines
    for line in (text or "").splitlines():
        if depth > 0 or _CODE_START.match(line):
            stripped = line.strip()
            collected.append(stripped)
            # Clamp at zero: a stray closing bracket must not hide the continuation
            # lines of a later multi-line statement.
            depth = max(0, depth + _bracket_delta(stripped))
    return "\n".join(collected)


# Kept for backward compatibility.
extract_code_lines = extract_code_block
def try_exec_python(code: str, shared_ns: dict) -> str | None:
"""
在 shared_ns 命名空间中执行 code捕获 stdout。
返回输出字符串,失败返回 None。
"""
buf = io.StringIO()
try:
with redirect_stdout(buf):
exec(code, shared_ns) # noqa: S102
output = buf.getvalue().strip()
return output if output else None
except Exception:
return None
async def _resume_ai_trio(sb, paper_id: str, questions: list[dict]):
    """Generate the AI trio for questions that lack a solution, writing each result back.

    Safe to re-run: questions that already have a ``solution`` are skipped, so an
    interrupted run continues where it stopped. The paper is marked ``ready`` at
    the end, even if some batches failed (those questions simply stay without a
    solution and are picked up on the next run).

    Args:
        sb: Supabase client.
        paper_id: Id of the paper being processed.
        questions: Rows with at least ``id``, ``question_number`` and ``solution``.
    """
    import logging  # function-scope import: keeps this block self-contained

    log = logging.getLogger(__name__)
    batch_size = 3

    need = [q for q in questions if not q.get("solution")]
    if not need:
        # Every question already has a solution: just mark the paper as done.
        sb.table("papers").update({"status": "ready", "processing_step": None}).eq("id", paper_id).execute()
        return
    total_q = len(questions)
    done_q = total_q - len(need)
    id_map = {q["question_number"]: q["id"] for q in need}
    # The full question text is required to generate the AI trio.
    full_data = sb.table("paper_questions").select(
        "id, question_number, question_type, question_text, score, correct_option, correct_answer, raw_answer_text"
    ).eq("paper_id", paper_id).in_("id", [q["id"] for q in need]).execute().data
    payloads = []
    for q in full_data:
        answer_section = q.get("raw_answer_text") or ""
        if not answer_section and q.get("correct_option"):
            answer_section = f"Correct option: {q['correct_option']}"
        elif not answer_section and q.get("correct_answer"):
            answer_section = f"Correct answer: {q['correct_answer']}"
        payloads.append({
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q["question_text"] or "",
            "reference_answer": answer_section,
        })
    batches = chunked(payloads, batch_size)
    for batch_idx, batch in enumerate(batches, 1):
        current = done_q + batch_idx * batch_size
        _update_progress(sb, paper_id, f"Generating solutions ({min(current, total_q)}/{total_q} questions)", batch_idx, len(batches))
        try:
            result = await deepseek_json_completion(
                system_prompt=BATCH_ANALYSIS_PROMPT.format(
                    questions_payload=json.dumps(batch, ensure_ascii=False),
                ),
                temperature=0.3,
            )
            for item in result.get("analyses", []):
                qnum = item.get("question_number")
                qid = id_map.get(qnum)
                if qid:
                    sb.table("paper_questions").update({
                        "knowledge_reminder": item.get("knowledge_reminder", ""),
                        "ai_hint": item.get("ai_hint", ""),
                        "solution": item.get("solution", ""),
                    }).eq("id", qid).execute()
        except Exception:
            # Best effort: one failed batch must not stop the others, but leave a trace.
            log.exception(
                "AI trio batch %d/%d failed for paper %s", batch_idx, len(batches), paper_id
            )
        await asyncio.sleep(1)
    # Mark the paper as done.
    sb.table("papers").update({"status": "ready", "processing_step": None}).eq("id", paper_id).execute()
def _update_progress(sb, paper_id: str, step: str, progress: int = 0, total: int = 0):
    """Write the current processing step and its progress counters to the paper's row."""
    fields = {
        "processing_step": step,
        "processing_progress": progress,
        "processing_total": total,
    }
    sb.table("papers").update(fields).eq("id", paper_id).execute()
async def process_paper(paper_id: str, paper_bytes: bytes, answer_bytes: bytes | None):
    """Background pipeline: PDF pages → Vision structuring → answer matching → AI trio.

    Design principle: each step persists its result as soon as it completes, so a
    re-run can resume. If questions already exist for the paper, extraction is
    skipped and only the missing AI trios are generated.

    Args:
        paper_id: Id of the paper row to process.
        paper_bytes: Raw bytes of the exam paper PDF.
        answer_bytes: Raw bytes of the answer PDF, or ``None`` when there is none.

    Raises:
        Exception: Any pipeline failure is recorded on the paper row
            (``status="error"``) and then re-raised.
    """
    import logging  # function-scope import: keeps this block self-contained

    log = logging.getLogger(__name__)
    sb = get_supabase()
    try:
        # Resume case: questions already stored → skip extraction, only fill in AI trios.
        existing = sb.table("paper_questions").select("id, question_number, solution").eq("paper_id", paper_id).execute().data
        if existing:
            await _resume_ai_trio(sb, paper_id, existing)
            return

        # ── Step 1: PDF → images ──
        _update_progress(sb, paper_id, "Rendering PDF pages...")
        paper_images = pdf_to_images(paper_bytes)

        # ── Step 2: Vision question extraction, a few pages per request ──
        PAGE_BATCH = 8
        all_questions: list = []
        meta: dict = {}
        num_page_batches = -(-len(paper_images) // PAGE_BATCH)  # ceiling division
        for i in range(0, len(paper_images), PAGE_BATCH):
            batch_imgs = paper_images[i:i + PAGE_BATCH]
            batch_idx = i // PAGE_BATCH + 1
            _update_progress(sb, paper_id, f"Reading pages {i+1}-{i+len(batch_imgs)}...", batch_idx, num_page_batches)
            batch_result = await gemini_vision_json(
                system_prompt=STRUCTURE_PROMPT,
                images=batch_imgs,
                user_text=f"Pages {i+1}-{i+len(batch_imgs)} of the exam paper. Extract all questions visible on these pages.",
                temperature=0,
            )
            if not meta:
                # Paper-level metadata is taken from the first page batch only.
                meta = {k: batch_result.get(k) for k in ("total_score", "difficulty_level", "topics_summary")}
            all_questions.extend(batch_result.get("questions") or [])
        questions = sort_questions(all_questions)

        # Update the paper overview.
        sb.table("papers").update({
            "total_score": meta.get("total_score"),
            "question_count": len(questions),
            "topics_summary": meta.get("topics_summary"),
            "difficulty_level": meta.get("difficulty_level"),
        }).eq("id", paper_id).execute()

        # ── Step 3: answer matching (batched; failures are skipped) ──
        answers_map = {}
        if answer_bytes:
            _update_progress(sb, paper_id, "Matching answers...")
            try:
                answer_images = pdf_to_images(answer_bytes)
                questions_json = json.dumps(
                    [{"question_number": q["question_number"], "question_type": q["question_type"]}
                     for q in questions], ensure_ascii=False,
                )
                all_answers: list = []
                for ai in range(0, len(answer_images), 8):
                    batch_ans_imgs = answer_images[ai:ai + 8]
                    try:
                        match_result = await gemini_vision_json(
                            system_prompt=ANSWER_MATCH_PROMPT.format(
                                questions_json=questions_json, answer_text="(See images)",
                            ),
                            images=batch_ans_imgs,
                            user_text=f"Match answers to these questions: {questions_json}",
                            temperature=0,
                        )
                        all_answers.extend(match_result.get("answers") or [])
                    except Exception:
                        log.exception("Answer matching failed for answer pages starting at %d (paper %s)", ai + 1, paper_id)
                # Skip malformed entries instead of letting one of them discard every answer.
                answers_map = {
                    a["question_number"]: a
                    for a in all_answers
                    if isinstance(a, dict) and a.get("question_number") is not None
                }
            except Exception:
                log.exception("Answer matching failed for paper %s; continuing without answers", paper_id)

        # ── Step 4: store the questions right away (AI trio comes later) ──
        _update_progress(sb, paper_id, "Saving questions...")
        rows = []
        for order, q in enumerate(questions):
            qnum = q["question_number"]
            answer = answers_map.get(qnum, {})
            # "topics" may be missing, None or empty; indexing [0] blindly would raise.
            topics = q.get("topics") or []
            rows.append({
                "paper_id": paper_id,
                "question_number": qnum,
                "parent_question": q.get("parent_question"),
                "display_order": order,
                "question_type": q["question_type"],
                "question_text": q["question_text"],
                "score": q.get("score"),
                "page_number": q.get("page_number"),
                "options": q.get("options"),
                "correct_option": answer.get("correct_option"),
                "correct_answer": answer.get("correct_answer"),
                "raw_answer_text": answer.get("raw_answer_text"),
                "topics": topics,
                "analytics_topic": topics[0] if topics else None,
                "topic_tags": topics,
                "difficulty": q.get("difficulty"),
            })
        if rows:
            # One request for all rows: a failure cannot leave a partial question set
            # that the resume path above would later mistake for a complete one.
            sb.table("paper_questions").insert(strip_nulls(rows)).execute()

        # ── Step 5: AI trio, written back per question (resumable) ──
        saved = sb.table("paper_questions").select("id, question_number, solution").eq("paper_id", paper_id).execute().data
        await _resume_ai_trio(sb, paper_id, saved)
    except Exception as e:
        sb.table("papers").update({
            "status": "error",
            "error_message": f"{type(e).__name__}: {str(e)}\n{traceback.format_exc()[-500:]}",
        }).eq("id", paper_id).execute()
        raise

View File

@@ -0,0 +1,13 @@
from supabase import create_client, Client
from app.config import get_settings
_client: Client | None = None
def get_supabase() -> Client:
    """Return the shared Supabase client (service_role key, bypasses RLS), creating it on first use."""
    global _client
    if _client is not None:
        return _client
    settings = get_settings()
    _client = create_client(settings.supabase_url, settings.supabase_service_role_key)
    return _client

View File

@@ -0,0 +1,48 @@
"""PDF 文本提取 — 复用 SOS 的 text_extractor 逻辑"""
import base64
import fitz # PyMuPDF
from dataclasses import dataclass
@dataclass
class ExtractedContent:
    """Text and page images extracted from one PDF by extract_pdf()."""

    pages_text: list[str]  # text of each page, in page order
    page_images: dict[int, str]  # 0-based page index → base64 PNG, for pages with very little text
    total_pages: int  # number of pages read
    has_images: bool  # True when page_images is non-empty
def extract_pdf(
    file_bytes: bytes,
    *,
    min_text_chars: int = 50,
    image_dpi: int = 200,
) -> ExtractedContent:
    """Extract per-page text from a PDF, rendering text-poor pages as images.

    Args:
        file_bytes: Raw bytes of the PDF document.
        min_text_chars: A page whose stripped text is shorter than this is also
            rendered to an image (it may be a scan) for later Vision OCR.
        image_dpi: Resolution used for those page renders.

    Returns:
        An ExtractedContent with the text of every page and the rendered pages
        keyed by 0-based page index.
    """
    pages_text = []
    page_images = {}
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        for index, page in enumerate(doc):
            text = page.get_text("text")
            pages_text.append(text)
            # Only the text length is checked here; whether the page actually
            # contains an image is not inspected.
            if len(text.strip()) < min_text_chars:
                pix = page.get_pixmap(dpi=image_dpi)
                page_images[index] = base64.b64encode(pix.tobytes("png")).decode("utf-8")
    finally:
        # Close the document even when a page fails to extract or render.
        doc.close()
    return ExtractedContent(
        pages_text=pages_text,
        page_images=page_images,
        total_pages=len(pages_text),
        has_images=len(page_images) > 0,
    )
def get_full_text(extracted: ExtractedContent) -> str:
    """Join the text of all non-blank pages, each under a ``--- Page N ---`` header (N is 1-based)."""
    sections = []
    for page_no, text in enumerate(extracted.pages_text, start=1):
        if not text.strip():
            continue
        sections.append(f"--- Page {page_no} ---\n{text}")
    return "\n\n".join(sections)

View File

@@ -0,0 +1,252 @@
"""
重新生成所有题目的 AI trio子题带父题上下文。
用法: python backfill_ai_trio_with_context.py [--paper-id <id>] [--course <code>]
"""
import asyncio
import io
import json
import re
import sys
import time
import argparse
from contextlib import redirect_stdout
from app.services.supabase_client import get_supabase
from app.services.llm_clients import get_deepseek_client
# A line that clearly starts Python code: an import, an assignment or a print call.
_CODE_START = re.compile(r"^\s*(import |from \w|[A-Za-z_]\w*\s*=|print\()")


def _bracket_delta(line: str) -> int:
    """Return opening minus closing brackets on ``line``, skipping quoted text and ``#`` comments."""
    delta = 0
    quote = None
    escaped = False
    for ch in line:
        if quote is not None:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
            continue
        if ch in "\"'":
            quote = ch
        elif ch == "#":
            break
        elif ch in "([{":
            delta += 1
        elif ch in ")]}":
            delta -= 1
    return delta


def extract_code_lines(text: str) -> str:
    """Extract the Python code lines embedded in a question's text.

    A line is kept when it clearly starts code (import / assignment / print),
    or when it continues a statement whose brackets are still open. Kept lines
    are stripped of surrounding whitespace and joined with newlines.

    Args:
        text: Question text; ``None`` or empty yields an empty string.

    Returns:
        The collected code lines, or ``""`` when none are found.
    """
    collected = []
    depth = 0  # number of brackets currently open across collected lines
    for line in (text or "").splitlines():
        if depth > 0 or _CODE_START.match(line):
            stripped = line.strip()
            collected.append(stripped)
            # Clamp at zero: a stray closing bracket must not hide the continuation
            # lines of a later multi-line statement.
            depth = max(0, depth + _bracket_delta(stripped))
    return "\n".join(collected)
def try_exec_python(code: str, shared_ns: dict) -> str | None:
buf = io.StringIO()
try:
with redirect_stdout(buf):
exec(code, shared_ns) # noqa: S102
output = buf.getvalue().strip()
return output if output else None
except Exception:
return None
# Batched analysis prompt template, filled with str.format() in deepseek_batch().
# Placeholder: questions_payload (a JSON array of question dicts). The JSON skeleton's braces
# are doubled for str.format().
BATCH_ANALYSIS_PROMPT = """You are an expert academic answer analyst. Generate three study sections for each question below. ALL output must be in English.
For every question, return:
- knowledge_reminder: concise prerequisite bullets in HTML
- ai_hint: a helpful hint in HTML without revealing the final answer
- solution: a complete step-by-step solution in HTML
Return JSON in this exact format:
{{
"analyses": [
{{
"question_number": "1a",
"knowledge_reminder": "<HTML>...</HTML>",
"ai_hint": "<HTML>...</HTML>",
"solution": "<HTML>...</HTML>"
}}
]
}}
Rules:
- Return one item for every provided question_number
- All text must be in English
- HTML only, KaTeX compatible (block $$ ... $$ inline $ ... $)
- For MC questions, explain why the correct option is right and why others are wrong
- For long questions, show a complete derivation or reasoning chain
- Use <ol> or numbered steps in solution when appropriate
- Mark common mistakes with <div class="common-error">...</div>
- CRITICAL: When a question_text contains "[Context from parent question X]" followed by "[Sub-question Y]", the parent section is background context only. You MUST solve ONLY the specific sub-question labeled [Sub-question Y]. Do NOT solve other sub-questions listed in the parent context. Give one precise answer for that single sub-question only.
Questions:
{questions_payload}
"""
def chunked(lst, size):
    """Split ``lst`` into consecutive slices of at most ``size`` elements."""
    pieces = []
    for start in range(0, len(lst), size):
        pieces.append(lst[start:start + size])
    return pieces
async def deepseek_batch(batch: list[dict]) -> list[dict]:
    """Ask DeepSeek for the AI trio of every question in ``batch``.

    Makes up to five attempts, printing each failure and sleeping 2, 4, 8 and
    16 seconds between them. Returns the ``analyses`` list of the first reply
    that parses, or an empty list when every attempt fails.
    """
    max_attempts = 5
    client = get_deepseek_client()
    for attempt in range(max_attempts):
        try:
            prompt = BATCH_ANALYSIS_PROMPT.format(
                questions_payload=json.dumps(batch, ensure_ascii=False)
            )
            resp = client.chat.completions.create(
                model="deepseek-chat",
                messages=[{"role": "system", "content": prompt}],
                temperature=0.3,
                max_tokens=8192,
                response_format={"type": "json_object"},
            )
            # Drop illegal control characters, then double the backslash of invalid escapes.
            cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', resp.choices[0].message.content)
            cleaned = re.sub(r'(?<!\\)((?:\\\\)*)\\([^"\\/bfnrtu])', r'\1\\\\\2', cleaned)
            return json.loads(cleaned).get("analyses", [])
        except Exception as e:
            print(f" attempt {attempt+1} failed: {e}")
            if attempt < max_attempts - 1:
                await asyncio.sleep(2 ** attempt * 2)
    return []
async def main():
    """Regenerate the AI trio for stored questions, giving sub-questions their parent's context.

    Command-line options:
        --paper-id      only process this paper
        --course        only process papers with this course code
        --missing-only  only process questions that have no solution yet

    For questions without a stored answer, the code found in the question text is
    executed (see try_exec_python) and its output is sent as the reference answer.
    """
    from collections import defaultdict

    try:
        import numpy as np
    except ImportError:
        np = None

    parser = argparse.ArgumentParser()
    parser.add_argument("--paper-id", help="Only process this paper")
    parser.add_argument("--course", help="Only process papers with this course code")
    parser.add_argument("--missing-only", action="store_true", help="Only process questions missing solution")
    args = parser.parse_args()
    sb = get_supabase()

    # Fetch all questions (with paper info for filtering).
    # NOTE(review): a single execute() may be capped by the API's max-rows setting —
    # confirm whether pagination is needed for large tables.
    query = sb.table("paper_questions").select(
        "id, paper_id, question_number, question_type, question_text, "
        "parent_question, score, correct_option, correct_answer, raw_answer_text, "
        "analytics_topic, topic_tags, solution"
    )
    if args.paper_id:
        query = query.eq("paper_id", args.paper_id)
    all_questions = query.order("paper_id").order("display_order").execute().data
    if args.course:
        # Filter by course via the papers table.
        papers_res = sb.table("papers").select("id").eq("course_code", args.course.upper()).execute()
        paper_ids = {p["id"] for p in papers_res.data}
        all_questions = [q for q in all_questions if q["paper_id"] in paper_ids]

    # Group by paper BEFORE applying --missing-only: a parent question that already has
    # a solution must still be available as context for its sub-questions.
    by_paper: dict[str, list] = defaultdict(list)
    for q in all_questions:
        by_paper[q["paper_id"]].append(q)

    def is_target(q: dict) -> bool:
        return not args.missing_only or not q.get("solution")

    target_count = sum(1 for q in all_questions if is_target(q))
    if args.missing_only:
        print(f"Questions missing solution: {target_count}")
    else:
        print(f"Total questions to process: {target_count}")

    total_updated = 0
    for paper_id, paper_questions in by_paper.items():
        questions = [q for q in paper_questions if is_target(q)]
        if not questions:
            continue
        print(f"\nPaper {paper_id}: {len(questions)} questions")
        # Any question of the paper may be the parent of another one.
        parent_text_map: dict[str, str] = {
            q["question_number"]: q["question_text"] or ""
            for q in paper_questions
        }

        # Build payloads with parent context + executed Python output.
        payloads = []
        exec_namespaces: dict[str, dict] = {}
        for q in questions:
            parent_q = q.get("parent_question")
            if parent_q and parent_q in parent_text_map:
                full_text = (
                    f"[Context from parent question {parent_q}]\n"
                    f"{parent_text_map[parent_q]}\n\n"
                    f"[Sub-question {q['question_number']}]\n"
                    f"{q['question_text'] or ''}"
                )
            else:
                full_text = q["question_text"] or ""
            answer_section = ""
            if q.get("raw_answer_text"):
                answer_section = q["raw_answer_text"]
            elif q.get("correct_option"):
                answer_section = f"Correct option: {q['correct_option']}"
            elif q.get("correct_answer"):
                answer_section = f"Correct answer: {q['correct_answer']}"

            # No stored answer: try executing the question's code to get its real output.
            # SECURITY: this runs code taken from stored question text via exec().
            if not answer_section:
                group_key = parent_q or q["question_number"]
                if group_key not in exec_namespaces:
                    ns: dict = {}
                    if np is not None:
                        ns["np"] = np
                    # Run the parent question's setup code first.
                    if parent_q and parent_q in parent_text_map:
                        setup = extract_code_lines(parent_text_map[parent_q])
                        try_exec_python(setup, ns)
                    exec_namespaces[group_key] = ns
                ns = exec_namespaces[group_key]
                sub_code = extract_code_lines(q["question_text"] or "")
                if sub_code:
                    exec_out = try_exec_python(sub_code, ns)
                    if exec_out is not None:
                        answer_section = f"Executed output: {exec_out}"
                        print(f" [exec] {q['question_number']}: {exec_out[:60]}")
            payloads.append({
                "_id": q["id"],
                "question_number": q["question_number"],
                "question_type": q["question_type"] or "long_question",
                "score": q.get("score") or "unknown",
                "question_text": full_text,
                "reference_answer": answer_section,
            })

        # Process in batches of 3.
        id_map = {q["question_number"]: q["id"] for q in questions}
        for batch in chunked(payloads, 3):
            # Strip the internal _id before sending to the model.
            model_batch = [{k: v for k, v in p.items() if k != "_id"} for p in batch]
            nums = [p["question_number"] for p in batch]
            print(f" Batch {nums} ...", end=" ", flush=True)
            analyses = await deepseek_batch(model_batch)
            batch_updated = 0
            for item in analyses:
                qnum = item.get("question_number")
                qid = id_map.get(qnum)
                if not qid:
                    continue
                sb.table("paper_questions").update({
                    "knowledge_reminder": item.get("knowledge_reminder"),
                    "ai_hint": item.get("ai_hint"),
                    "solution": item.get("solution"),
                }).eq("id", qid).execute()
                batch_updated += 1
            total_updated += batch_updated
            # Report rows actually written, not the number of items the model returned.
            print(f"done ({batch_updated} updated)")
            await asyncio.sleep(1)
    print(f"\nDone. Total updated: {total_updated}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,160 @@
"""Backfill page_y_ratio for COMP2211 subquestions."""
from __future__ import annotations
import re
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import fitz
import httpx
from app.services.supabase_client import get_supabase
# Parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Local folder with the scraped COMP2211 PDFs, resolved relative to ROOT.
PAPERS_DIR = ROOT / "pastpaper-scraper" / "papers" / "COMP2211"
# Maps papers.source_exam_key to the PDF file name inside PAPERS_DIR.
# main() skips any paper whose key is not listed here.
PDF_BY_EXAM_KEY = {
    "COMP2211-2022-fall-midterm": "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
    "COMP2211-2022-spring-midterm": "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
    "COMP2211-2022-spring-final-part-a": "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
    "COMP2211-2022-spring-final-part-b": "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
    "COMP2211-2023-spring-midterm": "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
    "COMP2211-2024-spring-midterm": "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
    "COMP2211-2024-spring-final": "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
}
def marker_candidates(question_number: str) -> list[str]:
    """Return the page markers to search for, most specific first.

    Supported shapes of ``question_number``:

    * ``"3a_ii"`` -> ``["(ii)", "(a)"]``  (sub-part first, then its parent letter)
    * ``"3ab_i"`` -> ``["(i)", "(ab)"]``
    * ``"3a"``    -> ``["(a)", "Problem 3"]``
    * ``"3"``     -> ``["Problem 3"]``
    * anything else is returned unchanged as the only candidate.

    Args:
        question_number: Question identifier as stored in paper_questions.

    Returns:
        Marker strings in the order they should be tried.
    """
    if "_" in question_number:
        left, right = question_number.split("_", 1)
        tokens: list[str] = []
        if re.fullmatch(r"\d+[a-z]+", left):
            # Drop the leading problem number so only the letter part remains.
            # Done outside the f-string: the pattern needs a single backslash,
            # and backslashes inside f-string expressions are a syntax error
            # before Python 3.12.
            letters = re.sub(r"^\d+", "", left)
            tokens.append(f"({letters})")
        tokens.append(f"({right})")
        # The right-hand (deepest) marker is the most specific, so try it first.
        return tokens[::-1]

    m = re.fullmatch(r"(\d+)([a-z])", question_number)
    if m:
        return [f"({m.group(2)})", f"Problem {m.group(1)}"]
    if question_number.isdigit():
        return [f"Problem {question_number}"]
    return [question_number]
def line_matches(line_text: str, marker: str) -> bool:
    """Tell whether a line of page text carries the given marker.

    Whitespace in ``line_text`` is trimmed and collapsed first. A marker that
    begins with "(" must appear at the very start of the line (case-sensitive);
    any other marker only has to occur somewhere in the line, ignoring case.
    Blank lines never match.
    """
    collapsed = re.sub(r"\s+", " ", line_text.strip())
    if collapsed == "":
        return False
    is_label_marker = marker.startswith("(")
    if is_label_marker:
        return collapsed.startswith(marker)
    return marker.lower() in collapsed.lower()
def line_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` by scanning the page's text lines.

    Every line of every type-0 block in ``page.get_text("dict")`` is rebuilt
    from its spans and tested with line_matches(). The smallest bbox[1] value
    among the matching lines is converted to a fraction of the page height.

    Returns:
        The fraction clamped to [0.0, 0.98], or None when no line matches.
    """
    top_y: float | None = None
    for block in page.get_text("dict").get("blocks", []):
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            joined = "".join(span.get("text", "") for span in line.get("spans", []))
            if not line_matches(joined, marker):
                continue
            bbox = line.get("bbox")
            if not bbox:
                continue
            candidate = float(bbox[1])
            if top_y is None or candidate < top_y:
                top_y = candidate

    if top_y is None:
        return None
    fraction = (top_y - page.rect.y0) / page.rect.height
    return max(0.0, min(fraction, 0.98))
def search_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate ``marker`` with ``page.search_for`` instead of line scanning.

    Returns:
        The smallest hit position as a fraction of the page height, clamped to
        [0.0, 0.98], or None when the search returns no rectangles.
    """
    best: float | None = None
    for hit in page.search_for(marker):
        fraction = (hit.y0 - page.rect.y0) / page.rect.height
        clamped = max(0.0, min(fraction, 0.98))
        if best is None or clamped < best:
            best = clamped
    return best
def infer_y_ratio(page: fitz.Page, question_number: str) -> float:
    """Estimate where a question starts on its page, as a fraction of page height.

    Each marker from marker_candidates() is tried in order, first with the
    line-based lookup and then with the plain text search. The first position
    found is returned; 0.05 is used when nothing matches.
    """
    for marker in marker_candidates(question_number):
        for locate in (line_y_ratio, search_y_ratio):
            found = locate(page, marker)
            if found is not None:
                return found
    return 0.05
def main() -> None:
    """Compute and store page_y_ratio for COMP2211 course-library questions.

    For every paper whose source_exam_key has an entry in PDF_BY_EXAM_KEY, the
    page of each question is opened and infer_y_ratio() estimates where the
    question starts. All ratios are collected first and then written to
    paper_questions.page_y_ratio by a three-worker thread pool.

    Papers whose PDF is missing on disk and questions whose page_number lies
    outside the PDF are reported and skipped, so a single bad row cannot abort
    the run before any update has been written.

    Raises:
        httpx.HTTPError: when one update still fails on its fifth attempt.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id, source_exam_key")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
        or []
    )

    updates: list[tuple[str, float]] = []
    for paper in papers:
        exam_key = paper["source_exam_key"]
        pdf_name = PDF_BY_EXAM_KEY.get(exam_key)
        if not pdf_name:
            continue
        pdf_path = PAPERS_DIR / pdf_name
        if not pdf_path.is_file():
            print(f"SKIP {exam_key}: PDF not found at {pdf_path}")
            continue

        doc = fitz.open(pdf_path)
        try:
            questions = (
                sb.table("paper_questions")
                .select("id, question_number, page_number")
                .eq("paper_id", paper["id"])
                .order("display_order")
                .execute()
                .data
                or []
            )
            for question in questions:
                page_number = question.get("page_number") or 1
                # Reject out-of-range values explicitly: a too-large number would
                # raise, and a negative one would silently index from the end.
                if not 1 <= page_number <= doc.page_count:
                    print(
                        f"SKIP {exam_key} question {question['question_number']}: "
                        f"page_number {page_number} outside 1..{doc.page_count}"
                    )
                    continue
                page = doc[page_number - 1]
                ratio = infer_y_ratio(page, question["question_number"])
                updates.append((question["id"], round(ratio, 4)))
        finally:
            doc.close()

    def apply_update(payload: tuple[str, float]) -> None:
        """Write one ratio, retrying HTTP failures with a growing delay."""
        question_id, ratio = payload
        attempts = 0
        while True:
            try:
                sb.table("paper_questions").update({"page_y_ratio": ratio}).eq("id", question_id).execute()
                return
            except httpx.HTTPError:
                attempts += 1
                if attempts >= 5:
                    raise
                time.sleep(0.4 * attempts)

    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(apply_update, payload) for payload in updates]
        for future in as_completed(futures):
            # result() re-raises any exception from the worker thread.
            future.result()

    print(f"Backfilled page_y_ratio for {len(updates)} COMP2211 questions.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,365 @@
"""Backfill COMP2211 tags to the revised retrieval schema."""
from __future__ import annotations
import re
from collections import OrderedDict
from app.services.supabase_client import get_supabase
# Display labels for the known snake_case skill tags. normalize_skill_tag()
# falls back to title_case_with_acronyms() for any tag not listed here.
SKILL_LABELS = {
    "concept_check": "Concept Check",
    "code_tracing": "Code Tracing",
    "algorithm_tracing": "Algorithm Tracing",
    "distance_calculation": "Distance Calculation",
    "centroid_update": "Centroid Update",
    "weight_update": "Weight Update",
    "decision_boundary": "Decision Boundary",
    "implementation": "Implementation",
    "debugging": "Debugging",
    "model_selection": "Model Selection",
    "concept_explanation": "Concept Explanation",
    "architecture_reasoning": "Architecture Reasoning",
    "convergence_reasoning": "Convergence Reasoning",
    "generalization_reasoning": "Generalization Reasoning",
    "classification_decision": "Classification Decision",
}
# Lower-cased words whose display form is not a plain capitalisation;
# looked up by title_case_with_acronyms().
ACRONYMS = {
    "ai": "AI",
    "cnn": "CNN",
    "knn": "KNN",
    "mlp": "MLP",
    "nb": "NB",
    "numpy": "NumPy",
}
def title_case_with_acronyms(value: str) -> str:
    """Turn a whitespace- or underscore-separated tag into a display label.

    Each word is lower-cased, then replaced by its ACRONYMS entry when one
    exists and capitalised otherwise. Words are joined with single spaces.
    """
    lowered_words = (
        piece.lower()
        for piece in re.split(r"[\s_]+", value.strip())
        if piece
    )
    return " ".join(
        ACRONYMS.get(word, word.capitalize()) for word in lowered_words
    )
def normalize_skill_tag(tag: str) -> str:
    """Map a raw skill tag to its display label.

    Known tags come straight from SKILL_LABELS; everything else is formatted
    by title_case_with_acronyms().
    """
    try:
        return SKILL_LABELS[tag]
    except KeyError:
        return title_case_with_acronyms(tag)
def text_blob(question: dict) -> str:
    """Concatenate a question's searchable fields into one lower-cased string.

    The fields are, in order: question_text, raw_answer_text, topic_tags,
    skill_tags and analytics_topic. Missing or null fields contribute an empty
    string; list fields are space-joined. Fields are separated by one space.
    """

    def scalar(name: str) -> str:
        return question.get(name) or ""

    def joined(name: str) -> str:
        return " ".join(question.get(name) or [])

    pieces = (
        scalar("question_text"),
        scalar("raw_answer_text"),
        joined("topic_tags"),
        joined("skill_tags"),
        scalar("analytics_topic"),
    )
    return " ".join(pieces).lower()
def has_any(text: str, phrases: list[str]) -> bool:
    """Return True when at least one of ``phrases`` is a substring of ``text``."""
    for phrase in phrases:
        if phrase in text:
            return True
    return False
def infer_analytics_topic(question: dict) -> str:
    """Pick the single fine-grained analytics topic for a question.

    The decision is a precedence-ordered chain of substring rules over
    text_blob(question); the first rule that matches wins, so the order of the
    checks below is significant. When no keyword rule matches, the question's
    existing (broad) analytics_topic is split into a fine-grained topic, and
    "Python and NumPy" is the final fallback.

    Args:
        question: A paper_questions row; reads analytics_topic and skill_tags
            directly, plus whatever text_blob() reads.

    Returns:
        One of the topic names returned by the rules below.
    """
    text = text_blob(question)
    # Existing topic value on the row; may be one of the old broad categories.
    broad = question.get("analytics_topic") or ""
    skills = {normalize_skill_tag(tag) for tag in (question.get("skill_tags") or [])}
    # --- Stage 1: keyword rules, checked in order; first match wins. ---
    if has_any(text, ["ethics", "bias", "privacy", "autonomous vehicle", "informed consent", "human participants", "ethically"]):
        return "Ethics of AI"
    if has_any(text, ["minimax", "alpha-beta", "alpha beta", "game tree", "tic-tac-toe", "tic tac toe"]):
        return "Game Trees"
    if has_any(text, ["search algorithm", "best-first", "breadth-first", "depth-first", "a* search", "a star"]):
        return "Search Algorithms"
    if has_any(text, ["cross validation", "d-fold", "k-fold", "train/val", "validation set", "fold "]) or broad == "Cross Validation":
        return "Cross Validation"
    if has_any(text, ["confusion matrix", "precision", "recall", "macro f1", "f1 score", "accuracy score", "evaluation metric"]):
        return "Evaluation Metrics"
    if has_any(text, ["naive bayes", "gaussian distribution", "laplace smoothing", "likelihood", "posterior probability"]) or broad == "Naive Bayes":
        return "Naive Bayes"
    if has_any(text, ["bayes classifier", "conditional probability", "bayesian inference", "prior probability", "posterior"]) or broad == "Bayesian Inference":
        return "Bayesian Inference"
    if has_any(text, ["leader clustering", "k-means", "k means", "centroid", "elbow method", "silhouette", "cluster assignments", "closest centroid", "new cluster"]):
        return "K-Means"
    if has_any(text, ["k-nearest", "nearest neighbors", "weighted knn", "cosine distance", "euclidean distance", "manhattan distance", "6-cross-validation error for k", "class for cosine distance"]):
        return "KNN"
    # MLP is tested before Perceptron because "multilayer perceptron" also
    # contains the substring "perceptron".
    if has_any(text, ["multilayer perceptron", "mlp", "back propagation", "backpropagation", "hidden layer", "output layer", "dropout", "softmax", "sigmoid function", "relu as the activation"]) or broad == "MLP":
        return "MLP"
    if has_any(text, ["perceptron", "decision boundary", "single neuron", "weight update", "activation function f(z)", "linearly separable"]) or broad == "Perceptron":
        return "Perceptron"
    if has_any(text, ["convolutional neural network", "cnn", "kernel", "padding", "stride", "pooling", "dilated convolution", "3d convolution", "otsu", "histogram", "image processing", "grayscale image"]):
        return "CNN"
    if has_any(text, ["numpy", "python", "np.", "broadcasting", "reshape", "transpose", "mask", "vectorized", "np.arange", "np.mean", "np.dot", "np.convolve"]):
        return "Python and NumPy"
    # --- Stage 2: no keyword rule matched; split an old broad category. ---
    if broad == "KNN and Clustering":
        if (
            has_any(text, ["k-means", "k means", "centroid", "leader clustering", "elbow", "silhouette"])
            or "Centroid Update" in skills
            or "Convergence Reasoning" in skills
            or "Algorithm Tracing" in skills
            or "Model Selection" in skills
        ):
            return "K-Means"
        return "KNN"
    if broad == "Perceptron and MLP":
        if (
            has_any(text, ["hidden layer", "backprop", "activation function", "softmax", "relu", "sigmoid", "multilayer perceptron", "mlp"])
            or "Architecture Reasoning" in skills
        ):
            return "MLP"
        return "Perceptron"
    if broad == "Probabilistic Models":
        if has_any(text, ["naive bayes", "gaussian", "laplace", "likelihood"]):
            return "Naive Bayes"
        return "Bayesian Inference"
    if broad == "Evaluation and Validation":
        if has_any(text, ["cross validation", "cross-validation", "k-fold", "d-fold", "validation set", "train/val"]):
            return "Cross Validation"
        return "Evaluation Metrics"
    if broad == "Search and Games":
        if has_any(text, ["minimax", "alpha-beta", "alpha beta", "game tree"]):
            return "Game Trees"
        return "Search Algorithms"
    # --- Stage 3: direct renames, with "Python and NumPy" as the last resort. ---
    broad_map = {
        "Vision and CNN": "CNN",
        "Python Fundamentals": "Python and NumPy",
        "Ethics of AI": "Ethics of AI",
    }
    return broad_map.get(broad, "Python and NumPy")
# For each analytics topic: ordered (tag label, trigger substrings) pairs.
# build_topic_tags() adds a label when any of its substrings occurs in the
# question's lower-cased text blob. The pair whose label equals the topic
# itself is skipped there, because the topic is always emitted as the first tag.
TOPIC_CONCEPTS = {
    "Naive Bayes": [
        ("Naive Bayes", ["naive bayes"]),
        ("Prior", ["prior"]),
        ("Likelihood", ["likelihood"]),
        ("Posterior", ["posterior"]),
        ("Gaussian", ["gaussian"]),
        ("Laplace Smoothing", ["laplace"]),
        ("Missing Data", ["missing data", "missing value"]),
    ],
    "Bayesian Inference": [
        ("Bayesian Inference", ["bayes", "conditional probability", "posterior"]),
        ("Conditional Probability", ["conditional probability"]),
        ("Bayes Rule", ["bayes rule", "posterior"]),
        ("Prior", ["prior"]),
        ("Posterior", ["posterior"]),
    ],
    "KNN": [
        ("KNN", ["k-nearest", "nearest neighbors", "knn"]),
        ("Euclidean Distance", ["euclidean distance"]),
        ("Manhattan Distance", ["manhattan distance"]),
        ("Cosine Distance", ["cosine distance"]),
        ("Weighted KNN", ["weighted k-nearest", "weighted knn", "inverse of the distance"]),
        ("Classification", ["class label", "predict", "classification"]),
        ("Cross Validation", ["cross-validation", "cross validation"]),
        ("Test Error", ["test error"]),
    ],
    "K-Means": [
        ("K-Means", ["k-means", "k means"]),
        ("Centroid Update", ["centroid"]),
        ("Convergence", ["converged", "convergence"]),
        ("Leader Clustering", ["leader clustering"]),
        ("Outliers", ["outlier"]),
        ("Model Selection", ["elbow method", "silhouette", "suitable k"]),
    ],
    "Perceptron": [
        ("Perceptron", ["perceptron"]),
        ("Decision Boundary", ["decision boundary", "linearly separable"]),
        ("Weight Update", ["weight update", "∆w", "deltaw", "backward propagation"]),
        ("Convergence", ["converged", "convergence"]),
        ("Activation Function", ["activation function"]),
    ],
    "MLP": [
        ("MLP", ["mlp", "multilayer perceptron"]),
        ("Backpropagation", ["back propagation", "backpropagation", "backward propagation"]),
        ("Activation Function", ["activation function", "relu", "sigmoid", "softmax"]),
        ("Hidden Layer", ["hidden layer"]),
        ("Output Layer", ["output layer"]),
        ("Parameter Count", ["number of parameters", "parameter"]),
        ("Overfitting", ["overfitting", "dropout"]),
    ],
    "CNN": [
        ("CNN", ["cnn", "convolutional neural network"]),
        ("Convolution", ["convolution", "kernel"]),
        ("Padding", ["padding", "reflection padding", "zero padding"]),
        ("Stride", ["stride"]),
        ("Pooling", ["pooling", "max pooling", "average pooling"]),
        ("Image Processing", ["image processing", "grayscale image"]),
        ("Histogram", ["histogram"]),
        ("Otsu Thresholding", ["otsu"]),
        ("Dilated Convolution", ["dilated convolution"]),
        ("3D Convolution", ["3d convolution"]),
        ("Dropout", ["dropout"]),
    ],
    "Evaluation Metrics": [
        ("Evaluation Metrics", ["evaluation", "metric"]),
        ("Confusion Matrix", ["confusion matrix"]),
        ("Accuracy", ["accuracy"]),
        ("Precision", ["precision"]),
        ("Recall", ["recall"]),
        ("F1 Score", ["f1"]),
        ("Macro F1", ["macro f1"]),
    ],
    "Cross Validation": [
        ("Cross Validation", ["cross validation", "cross-validation", "d-fold", "k-fold"]),
        ("Train Validation Split", ["validation set", "train", "test fold"]),
        ("Model Selection", ["choose k", "which k", "fold"]),
        ("Data Shuffling", ["shuffle", "shuffling"]),
    ],
    "Python and NumPy": [
        ("Python and NumPy", ["numpy", "python"]),
        ("NumPy", ["numpy", "np."]),
        ("Broadcasting", ["broadcast"]),
        ("Array Indexing", ["index", "slice"]),
        ("Vectorization", ["no explicit loops", "vectorized"]),
        ("Matrix Multiplication", ["matmul", "matrix multiplication", "@"]),
        ("Reshape", ["reshape"]),
        ("Transpose", ["transpose"]),
        ("Masking", ["mask"]),
        ("Convolution", ["convolve"]),
    ],
    "Search Algorithms": [
        ("Search Algorithms", ["search"]),
        ("Breadth-First Search", ["breadth-first", "breadth first", "bfs"]),
        ("Depth-First Search", ["depth-first", "depth first", "dfs"]),
        ("Best-First Search", ["best-first", "best first"]),
        ("A* Search", ["a* search", "a star", "astar"]),
        ("Heuristic", ["heuristic"]),
    ],
    "Game Trees": [
        ("Game Trees", ["game tree", "minimax", "alpha-beta", "alpha beta"]),
        ("Minimax", ["minimax"]),
        ("Alpha-Beta Pruning", ["alpha-beta", "alpha beta", "pruned"]),
        ("Utility", ["utility"]),
    ],
    "Ethics of AI": [
        ("Ethics of AI", ["ethics", "ethical"]),
        ("Bias", ["bias"]),
        ("Privacy", ["privacy"]),
        ("Fairness", ["fair"]),
        ("Research Ethics", ["informed consent", "human participants"]),
        ("Governance", ["monitoring", "production", "organizations"]),
        ("Autonomous Vehicles", ["autonomous vehicle"]),
    ],
}
# Fallback topic tags per analytics topic. build_topic_tags() appends these,
# in order, until the question has at least two distinct tags.
TOPIC_DEFAULTS = {
    "Naive Bayes": ["Likelihood", "Posterior"],
    "Bayesian Inference": ["Conditional Probability", "Bayes Rule"],
    "KNN": ["Classification", "Distance Calculation"],
    "K-Means": ["Centroid Update", "Convergence"],
    "Perceptron": ["Decision Boundary", "Weight Update"],
    "MLP": ["Activation Function", "Hidden Layer"],
    "CNN": ["Convolution", "Padding"],
    "Evaluation Metrics": ["Confusion Matrix", "F1 Score"],
    "Cross Validation": ["Train Validation Split", "Model Selection"],
    "Python and NumPy": ["NumPy", "Vectorization"],
    "Search Algorithms": ["Breadth-First Search", "Heuristic"],
    "Game Trees": ["Minimax", "Alpha-Beta Pruning"],
    "Ethics of AI": ["Bias", "Fairness"],
}
# Skill tags used by build_skill_tags() when a question has no skill_tags of
# its own; topics missing from this table fall back to ["Concept Check"] there.
DEFAULT_SKILLS = {
    "Naive Bayes": ["Probability Reasoning"],
    "Bayesian Inference": ["Probability Reasoning"],
    "KNN": ["Classification Decision"],
    "K-Means": ["Centroid Update"],
    "Perceptron": ["Decision Boundary"],
    "MLP": ["Concept Explanation"],
    "CNN": ["Concept Explanation"],
    "Evaluation Metrics": ["Metric Reasoning"],
    "Cross Validation": ["Model Selection"],
    "Python and NumPy": ["Code Tracing"],
    "Search Algorithms": ["Algorithm Tracing"],
    "Game Trees": ["Game Reasoning"],
    "Ethics of AI": ["Ethical Reasoning"],
}
def unique_keep_order(values: list[str]) -> list[str]:
    """Drop falsy entries and duplicates, keeping first-seen order."""
    seen: dict[str, None] = {}
    for value in values:
        if value:
            seen.setdefault(value, None)
    return list(seen)
def build_topic_tags(question: dict, analytics_topic: str) -> list[str]:
    """Build the ordered topic tag list (at most five) for a question.

    The analytics topic always comes first, followed by every TOPIC_CONCEPTS
    label whose keywords occur in the question's text blob. TOPIC_DEFAULTS
    entries are appended only while fewer than two distinct tags exist.
    """
    blob = text_blob(question)
    matched = [
        label
        for label, keywords in TOPIC_CONCEPTS.get(analytics_topic, [])
        if label != analytics_topic and has_any(blob, keywords)
    ]
    tags: list[str] = [analytics_topic, *matched]

    for fallback in TOPIC_DEFAULTS.get(analytics_topic, []):
        if len(unique_keep_order(tags)) >= 2:
            break
        tags.append(fallback)

    return unique_keep_order(tags)[:5]
def build_skill_tags(question: dict, analytics_topic: str) -> list[str]:
    """Return up to three display-form skill tags for a question.

    Existing skill_tags are normalised and de-duplicated; when none remain,
    the DEFAULT_SKILLS entry for the topic (or ["Concept Check"]) is used.
    """
    existing = question.get("skill_tags") or []
    normalized = unique_keep_order([normalize_skill_tag(raw_tag) for raw_tag in existing])
    chosen = normalized or DEFAULT_SKILLS.get(analytics_topic, ["Concept Check"])
    return chosen[:3]
def main() -> None:
    """Re-tag every COMP2211 course-library question and write the tags back.

    For each question the analytics topic, topic tags and skill tags are
    recomputed and stored in analytics_topic, topic_primary, topic_tags,
    topics and skill_tags.
    """
    client = get_supabase()

    paper_rows = (
        client.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [row["id"] for row in paper_rows]
    if len(paper_ids) == 0:
        print("No COMP2211 course-library papers found.")
        return

    question_rows = (
        client.table("paper_questions")
        .select("id, paper_id, question_number, question_text, raw_answer_text, analytics_topic, topic_tags, skill_tags, topics")
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )

    for row in question_rows:
        topic = infer_analytics_topic(row)
        tags = build_topic_tags(row, topic)
        skills = build_skill_tags(row, topic)
        client.table("paper_questions").update(
            {
                "analytics_topic": topic,
                "topic_primary": topic,
                "topic_tags": tags,
                "topics": tags,
                "skill_tags": skills,
            }
        ).eq("id", row["id"]).execute()

    print(f"Backfilled {len(question_rows)} COMP2211 questions.")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,169 @@
"""Backfill AI trio for questions where knowledge_reminder IS NULL.
For each question, generates fields in two separate LLM calls to avoid token truncation:
Call 1 → knowledge_reminder + ai_hint (short, ~500 tokens output)
Call 2 → solution (long, up to 4096 tokens output)
Run from the backend directory:
uv run python backfill_null_ai_trio.py [--dry-run]
"""
from __future__ import annotations
import asyncio
import json
import sys
from app.services.supabase_client import get_supabase
from app.services.paper_processor import qwen_json_completion
# Prompt for call 1 (knowledge_reminder + ai_hint). It is filled with
# str.format(payload=...), so the doubled braces are literal JSON braces and
# {payload} is the only substituted field.
KNOWLEDGE_HINT_PROMPT = """\
You are an expert tutor. Given a past-paper question, produce two short study aids in English.
Return JSON exactly:
{{
"knowledge_reminder": "2-4 sentences summarising the key concept or formula the student must recall.",
"ai_hint": "1-3 sentence nudge that guides WITHOUT giving the answer away."
}}
Question:
{payload}
"""
# Prompt for call 2 (solution). Same str.format conventions as above; the
# trailing backslashes join the first three lines into one line of text.
SOLUTION_PROMPT = """\
You are an expert tutor. Given a past-paper question and its reference answer, write a clear, \
step-by-step model solution in English. Show all working. Be thorough but stop when the answer \
is complete — do not pad.
Return JSON exactly:
{{
"solution": "<full step-by-step solution as a single string, use \\n for line breaks>"
}}
Question:
{payload}
"""
def build_payload(q: dict) -> dict:
    """Assemble the question description that is sent to the LLM.

    The reference answer is taken from the first non-empty field among
    raw_answer_text, correct_option and correct_answer (in that order), and is
    an empty string when none is set. Missing question_type, score,
    question_text and topics fall back to "long_question", "unknown", "" and [].
    """
    raw_answer = q.get("raw_answer_text")
    option = q.get("correct_option")
    answer = q.get("correct_answer")

    if raw_answer:
        reference = raw_answer
    elif option:
        reference = f"Correct option: {option}"
    elif answer:
        reference = f"Correct answer: {answer}"
    else:
        reference = ""

    payload = {"question_number": q["question_number"]}
    payload["question_type"] = q["question_type"] or "long_question"
    payload["score"] = q.get("score") or "unknown"
    payload["question_text"] = q.get("question_text") or ""
    payload["topics"] = q.get("topics") or []
    payload["reference_answer"] = reference
    return payload
async def process_one(sb, q: dict, dry_run: bool) -> bool:
    """Generate and store the AI study aids for a single question.

    Two separate LLM calls are made, one second apart: the first for
    knowledge_reminder and ai_hint, the second for solution. A failing call is
    reported and does not stop the other one. Only non-empty fields are written.

    Args:
        sb: Supabase client used for the update.
        q: Question row; must contain id, question_number and question_type.
        dry_run: When true, only report what would be processed.

    Returns:
        True when the row was updated (or in dry-run mode), False when neither
        call produced any field.
    """
    prompt_input = json.dumps(build_payload(q), ensure_ascii=False)
    record_id = q["id"]
    label = q["question_number"]

    if dry_run:
        print(f" [dry-run] would process {label}")
        return True

    changes: dict = {}

    # Call 1: the two short study aids.
    try:
        aids = await qwen_json_completion(
            system_prompt=KNOWLEDGE_HINT_PROMPT.format(payload=prompt_input),
            temperature=0.3,
            max_tokens=1024,
        )
        for field in ("knowledge_reminder", "ai_hint"):
            if aids.get(field):
                changes[field] = aids[field]
    except Exception as e:
        print(f" WARN call-1 failed for {label}: {e}")

    await asyncio.sleep(1)

    # Call 2: the long-form solution, with a larger token budget.
    try:
        worked = await qwen_json_completion(
            system_prompt=SOLUTION_PROMPT.format(payload=prompt_input),
            temperature=0.3,
            max_tokens=4096,
        )
        if worked.get("solution"):
            changes["solution"] = worked["solution"]
    except Exception as e:
        print(f" WARN call-2 failed for {label}: {e}")

    if changes:
        sb.table("paper_questions").update(changes).eq("id", record_id).execute()
        return True

    print(f" SKIP {label}: both calls returned nothing")
    return False
async def backfill(dry_run: bool = False) -> None:
    """Fill in the AI trio for COMP2211 questions whose knowledge_reminder is NULL.

    Questions are fetched for all COMP2211 course-library papers, grouped by
    paper for readable output, and handed to process_one() one at a time with
    a 1.5 second pause between questions.

    Args:
        dry_run: Passed through to process_one(); nothing is written when true.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .is_("knowledge_reminder", "null")
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )
    if not questions:
        print("No NULL questions found — all done!")
        return
    print(f"Found {len(questions)} questions with NULL knowledge_reminder.")

    # Group by paper for cleaner output.
    from collections import defaultdict

    by_paper: dict[str, list] = defaultdict(list)
    for q in questions:
        by_paper[q["paper_id"]].append(q)

    total_updated = 0
    for paper_idx, (paper_id, qs) in enumerate(by_paper.items(), 1):
        # Separator added so the paper id does not run into the question count.
        print(f"\n[{paper_idx}/{len(by_paper)}] paper_id={paper_id} — {len(qs)} NULL questions")
        for q in qs:
            print(f" Processing {q['question_number']}...", end=" ", flush=True)
            ok = await process_one(sb, q, dry_run)
            if ok:
                total_updated += 1
                # Only report success when process_one() actually succeeded;
                # it prints its own SKIP line otherwise.
                print("done")
            await asyncio.sleep(1.5)

    print(f"\nDone. {total_updated}/{len(questions)} questions updated.")
if __name__ == "__main__":
dry_run = "--dry-run" in sys.argv
asyncio.run(backfill(dry_run=dry_run))

View File

@@ -0,0 +1,135 @@
"""Pre-compute similar_questions for all COMP2211 course-library questions.
For each question, runs the same similarity logic as the API and writes the result
into paper_questions.similar_questions (JSONB). The API will then return this
pre-computed value directly with no computation overhead.
Run from the backend directory:
uv run python backfill_similar_questions.py [--dry-run]
"""
from __future__ import annotations
import sys
from collections import Counter
from app.services.supabase_client import get_supabase
from app.routers.questions import (
similarity_score,
question_family,
display_topics,
)
def _source_label(paper: dict) -> str:
    """Build a label such as "2023 Spring Midterm Part A" from a papers row.

    NULL columns are treated as empty strings: ``dict.get(key, "")`` only
    applies its default to missing keys, so a stored NULL would otherwise
    reach ``.title()`` as None and raise.
    """
    year = paper.get("year") or ""
    term = (paper.get("term") or "").title()
    exam_type = (paper.get("exam_type") or "").title()
    label = f"{year} {term} {exam_type}".strip()
    if paper.get("part_label"):
        label = f"{label} Part {paper['part_label']}"
    return label


def run(dry_run: bool = False) -> None:
    """Pre-compute paper_questions.similar_questions for ready COMP2211 papers.

    For every question, candidates are the questions from other papers (limited
    to the same analytics_topic when the target has one). Each candidate is
    scored with similarity_score() using text_score=0.0; candidates below 20
    are dropped, the best match per paper is kept, and at most twelve results
    are stored.

    Args:
        dry_run: When true, results are printed but not written.
    """
    sb = get_supabase()

    # Fetch all ready COMP2211 papers.
    papers = (
        sb.table("papers")
        .select("id, year, term, exam_type, part_label")
        .eq("course_code", "COMP2211")
        .eq("status", "ready")
        .execute()
        .data
    )
    if not papers:
        print("No ready COMP2211 papers found.")
        return
    papers_by_id = {p["id"]: p for p in papers}
    paper_ids = list(papers_by_id.keys())

    # Fetch all questions for these papers.
    all_questions = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
            "difficulty, knowledge_reminder, ai_hint, solution"
        )
        .in_("paper_id", paper_ids)
        .execute()
        .data
    )
    print(f"Found {len(all_questions)} questions across {len(papers)} papers.")

    # Batch full-text scores are not practical here; skip the RPC and rely on
    # tag/topic scoring (text_score = 0 for all candidates).
    updated = 0
    skipped = 0
    for i, target in enumerate(all_questions, 1):
        target_paper_id = target["paper_id"]
        target_topic = target.get("analytics_topic")

        # Candidates: same course, different paper.
        candidates = [
            q for q in all_questions
            if q["paper_id"] != target_paper_id
        ]
        # Pre-filter by analytics_topic if available.
        if target_topic:
            candidates = [c for c in candidates if c.get("analytics_topic") == target_topic]
        if not candidates:
            skipped += 1
            print(f" [{i}/{len(all_questions)}] {target['question_number']} — no candidates, skip")
            continue

        ranked = []
        for candidate in candidates:
            match_percent, reasons = similarity_score(target, candidate, text_score=0.0)
            if match_percent < 20:
                continue
            paper = papers_by_id.get(candidate["paper_id"], {})
            ranked.append({
                "id": candidate["id"],
                "paper_id": candidate["paper_id"],
                "source": _source_label(paper),
                "question_number": candidate["question_number"],
                "match_percent": match_percent,
                "match_reasons": reasons,
                "question_type": question_family(candidate),
                "question_text": candidate["question_text"],
                "topics": display_topics(candidate),
                "difficulty": candidate.get("difficulty"),
                "knowledge_reminder": candidate.get("knowledge_reminder", ""),
                "ai_hint": candidate.get("ai_hint", ""),
                "solution": candidate.get("solution", ""),
            })
        ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"]))

        # Deduplicate: keep the best-ranked match per paper.
        seen_papers: set[str] = set()
        deduped = []
        for item in ranked:
            if item["paper_id"] not in seen_papers:
                seen_papers.add(item["paper_id"])
                deduped.append(item)
        deduped = deduped[:12]

        # Separator added so the question number does not run into the count.
        print(f" [{i}/{len(all_questions)}] {target['question_number']} — {len(deduped)} similar", end="")
        if dry_run:
            print(" [dry-run]")
            continue
        sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", target["id"]).execute()
        updated += 1
        print()

    print(f"\nDone. {updated} updated, {skipped} skipped (no candidates).")
if __name__ == "__main__":
dry_run = "--dry-run" in sys.argv
run(dry_run=dry_run)

238
backend/backfill_vision.py Normal file
View File

@@ -0,0 +1,238 @@
"""
用 Vision 模式重新处理所有已 ready 的试卷:
- 从 Supabase Storage 拉 PDF → 图片 → Vision 拆题 → exec → AI trio → 更新 DB
用法:
python backfill_vision.py --course COMP2211
python backfill_vision.py --paper-id <uuid>
"""
import asyncio
import argparse
import requests
from app.services.supabase_client import get_supabase
from app.services.paper_processor import (
process_paper,
strip_nulls,
pdf_to_images,
gemini_vision_json,
deepseek_json_completion,
parse_json_response,
extract_code_lines,
try_exec_python,
chunked,
sort_questions,
STRUCTURE_PROMPT,
ANSWER_MATCH_PROMPT,
BATCH_ANALYSIS_PROMPT,
)
import json
import traceback
async def reprocess_paper(paper: dict):
    """Re-run the Vision pipeline for one paper and replace its stored questions.

    Steps: download the paper PDF (and the answer PDF when present), render the
    pages to images, extract questions with Gemini Vision in page batches,
    match answers, try to execute ``print(...)`` lines for questions that have
    no reference answer, generate knowledge_reminder / ai_hint / solution with
    DeepSeek, then delete the paper's old paper_questions rows and insert the
    new ones.

    The existing rows are only deleted when at least one question was
    extracted, so a failed Vision run cannot wipe a paper.

    Args:
        paper: A papers row. Reads id, course_code, year, term, exam_type and
            paper_file_url; answer_file_url is optional.
    """
    sb = get_supabase()
    paper_id = paper["id"]
    label = f"{paper['course_code']} {paper['year']} {paper['term']} {paper['exam_type']}"
    print(f"\n=== {label} ({paper_id[:8]}) ===")

    # 1. Download the PDF(s).
    try:
        response = requests.get(paper["paper_file_url"], timeout=60)
        # Without this, an HTTP error page would be handed to the PDF renderer.
        response.raise_for_status()
        pdf_bytes = response.content
    except Exception as e:
        print(f" SKIP: failed to fetch PDF: {e}")
        return

    answer_bytes = None
    if paper.get("answer_file_url"):
        try:
            answer_response = requests.get(paper["answer_file_url"], timeout=60)
            answer_response.raise_for_status()
            answer_bytes = answer_response.content
        except Exception as e:
            # Answers are optional: report the failure and continue without them.
            print(f" WARN: failed to fetch answer PDF: {e}")

    # 2. PDF -> images (rendered once; the page count is reported afterwards).
    print(" Rendering pages...", end=" ", flush=True)
    paper_images = pdf_to_images(pdf_bytes)
    print(f"done ({len(paper_images)} pages)")

    # 3. Vision question extraction, in batches of PAGE_BATCH pages.
    PAGE_BATCH = 8
    all_questions: list = []
    meta: dict = {}
    print(f" Vision extraction ({len(paper_images)} pages, {-(-len(paper_images)//PAGE_BATCH)} batches)...")
    for i in range(0, len(paper_images), PAGE_BATCH):
        batch_imgs = paper_images[i:i + PAGE_BATCH]
        print(f" Pages {i+1}-{i+len(batch_imgs)}...", end=" ", flush=True)
        try:
            batch_result = await gemini_vision_json(
                system_prompt=STRUCTURE_PROMPT,
                images=batch_imgs,
                user_text=f"Pages {i+1}-{i+len(batch_imgs)} of the exam paper. Extract all questions visible on these pages.",
                temperature=0,
            )
            # Paper-level metadata is taken from the first batch that succeeds.
            if not meta:
                meta = {k: batch_result.get(k) for k in ("total_score", "difficulty_level", "topics_summary")}
            qs = batch_result.get("questions", [])
            all_questions.extend(qs)
            print(f"done ({len(qs)} questions)")
        except Exception as e:
            print(f"FAILED: {e}")

    structure = {**meta, "questions": all_questions}
    questions = sort_questions(all_questions)
    print(f" Total: {len(questions)} questions extracted")
    if not questions:
        # Every batch failed or returned nothing: keep the rows already stored
        # instead of deleting them and writing an empty paper.
        print(" SKIP: no questions extracted; existing questions left untouched")
        return

    # 4. Answer matching.
    answers_map = {}
    if answer_bytes:
        print(" Vision answer matching...", end=" ", flush=True)
        answer_images = pdf_to_images(answer_bytes)
        questions_json = json.dumps(
            [{"question_number": q["question_number"], "question_type": q["question_type"]}
             for q in questions], ensure_ascii=False
        )
        try:
            match_result = await gemini_vision_json(
                system_prompt=ANSWER_MATCH_PROMPT.format(
                    questions_json=questions_json, answer_text="(See images)"
                ),
                images=answer_images,
                user_text=f"Match answers to these questions: {questions_json}",
                temperature=0,
            )
            answers_map = {a["question_number"]: a for a in match_result.get("answers", [])}
            print(f"done ({len(answers_map)} matched)")
        except Exception as e:
            print(f"FAILED: {e}")

    # 5. Build the LLM payloads, executing Python where no answer is known.
    import numpy as np

    # One namespace per parent question, so sub-questions share variables.
    exec_namespaces: dict = {}
    batched_payloads = []
    for q in questions:
        qnum = q["question_number"]
        answer = answers_map.get(qnum, {})
        full_text = q["question_text"] or ""

        answer_section = ""
        if answer.get("raw_answer_text"):
            answer_section = answer["raw_answer_text"]
        elif answer.get("correct_option"):
            answer_section = f"Correct option: {answer['correct_option']}"
        elif answer.get("correct_answer"):
            answer_section = f"Correct answer: {answer['correct_answer']}"

        if not answer_section:
            parent_q = q.get("parent_question")
            group_key = parent_q or qnum
            if group_key not in exec_namespaces:
                ns: dict = {"np": np}
                # The first question seen in a group supplies the setup code.
                setup = extract_code_lines(full_text)
                try_exec_python(setup, ns)
                exec_namespaces[group_key] = ns
            ns = exec_namespaces[group_key]
            print_lines = [
                line.strip()
                for line in full_text.splitlines()
                if line.strip().startswith("print(")
            ]
            if print_lines:
                out = try_exec_python(print_lines[-1], ns)
                if out is not None:
                    answer_section = f"Executed output: {out}"
                    print(f" [exec] {qnum}: {out[:60]}")

        batched_payloads.append({
            "question_number": qnum,
            "question_type": q["question_type"],
            "score": q.get("score", "unknown"),
            "question_text": full_text,
            "topics": q.get("topics", []),
            "reference_answer": answer_section,
        })

    # 6. AI trio, three questions per request.
    print(f" Generating AI trio ({len(batched_payloads)} questions, {len(list(chunked(batched_payloads, 3)))} batches)...")
    analyses: dict = {}
    for batch in chunked(batched_payloads, 3):
        nums = [p["question_number"] for p in batch]
        print(f" Batch {nums}...", end=" ", flush=True)
        try:
            result = await deepseek_json_completion(
                system_prompt=BATCH_ANALYSIS_PROMPT.format(
                    questions_payload=json.dumps(batch, ensure_ascii=False)
                ),
                temperature=0.3,
            )
            for item in result.get("analyses", []):
                if item.get("question_number"):
                    analyses[item["question_number"]] = item
            print(f"done ({len(result.get('analyses', []))})")
        except Exception as e:
            print(f"FAILED: {e}")
        await asyncio.sleep(1)

    # 7. Replace the stored questions with the new ones.
    print(" Writing to DB...", end=" ", flush=True)
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    for i, q in enumerate(questions):
        qnum = q["question_number"]
        answer = answers_map.get(qnum, {})
        analysis = analyses.get(qnum, {})
        # topics may be missing, null or empty; indexing it blindly would raise
        # here, after the old rows have already been deleted.
        topic_list = q.get("topics") or []
        sb.table("paper_questions").insert(strip_nulls({
            "paper_id": paper_id,
            "question_number": qnum,
            "parent_question": q.get("parent_question"),
            "display_order": i,
            "question_type": q["question_type"],
            "question_text": q["question_text"],
            "score": q.get("score"),
            "page_number": q.get("page_number"),
            "options": q.get("options"),
            "correct_option": answer.get("correct_option"),
            "correct_answer": answer.get("correct_answer"),
            "raw_answer_text": answer.get("raw_answer_text"),
            "topics": q.get("topics", []),
            "analytics_topic": topic_list[0] if topic_list else None,
            "topic_tags": q.get("topics", []),
            "difficulty": q.get("difficulty"),
            "knowledge_reminder": analysis.get("knowledge_reminder", ""),
            "ai_hint": analysis.get("ai_hint", ""),
            "solution": analysis.get("solution", ""),
        })).execute()

    sb.table("papers").update({
        "question_count": len(questions),
        "total_score": structure.get("total_score"),
        "topics_summary": structure.get("topics_summary"),
        "difficulty_level": structure.get("difficulty_level"),
    }).eq("id", paper_id).execute()
    print(f"done ({len(questions)} questions written)")
async def main():
    """Reprocess every ``ready`` paper selected by the command-line filters.

    ``--paper-id`` narrows the run to one paper and takes precedence over
    ``--course``; with neither flag every ``ready`` paper is reprocessed in
    ``created_at`` order. A failure in one paper is printed with its
    traceback and the loop moves on to the next paper.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--course", help="Course code")
    cli.add_argument("--paper-id", help="Single paper ID")
    opts = cli.parse_args()

    selection = get_supabase().table("papers").select("*").eq("status", "ready")
    if opts.paper_id:
        selection = selection.eq("id", opts.paper_id)
    elif opts.course:
        # Course codes are matched in upper case.
        selection = selection.eq("course_code", opts.course.upper())
    ready_papers = selection.order("created_at").execute().data

    print(f"Papers to reprocess: {len(ready_papers)}")
    for record in ready_papers:
        try:
            await reprocess_paper(record)
        except Exception as e:
            # Best effort: report and keep going with the remaining papers.
            print(f" ERROR: {e}")
            traceback.print_exc()
    print("\nAll done.")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,29 @@
"""Deprecated: study aids must come from LLM output, not template fillers."""
from __future__ import annotations
import sys
# Explanation shown to anyone who still tries to run this retired script.
MESSAGE = """
fill_manual_study_aids.py is intentionally disabled.
Reason:
- knowledge_reminder / ai_hint / solution must be generated by LLM
- template-based filler content polluted the COMP2211 course library
Use one of these paths instead:
1. Regenerate study aids through the real LLM pipeline in app/services/paper_processor.py
2. Rebuild paper_questions from a reviewed source and then run LLM generation
This script must not be used to backfill production study aids.
""".strip()


def main() -> None:
    """Write the deprecation notice to stderr and exit with status 1."""
    sys.stderr.write(MESSAGE + "\n")
    sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,240 @@
"""Import a canonical course manifest into Supabase-backed papers."""
from __future__ import annotations
import argparse
import asyncio
import json
from pathlib import Path
from typing import Any
from app.services.paper_processor import process_paper
from app.services.supabase_client import get_supabase
def parse_args() -> argparse.Namespace:
    """Parse the importer's command-line flags from ``sys.argv``.

    Returns:
        A namespace with ``manifest`` and ``papers_root`` (both ``Path``,
        required), ``user_id`` and ``course_code`` (optional strings),
        ``exam_keys`` (list built from repeated ``--exam-key`` flags, empty
        by default) and the booleans ``process`` and ``dry_run``.
    """
    cli = argparse.ArgumentParser(
        description="Import a canonical course paper manifest into Supabase."
    )

    # Required filesystem inputs.
    required_paths = (
        ("--manifest", "Path to the manifest JSON file."),
        (
            "--papers-root",
            "Root folder that contains the course PDF files referenced by the manifest.",
        ),
    )
    for flag, description in required_paths:
        cli.add_argument(flag, type=Path, required=True, help=description)

    # Optional owner and filters.
    cli.add_argument(
        "--user-id",
        required=False,
        help="Existing auth.users UUID used as the owner of imported course-library rows.",
    )
    cli.add_argument(
        "--course-code",
        help="Optional filter to only import entries from one course.",
    )
    cli.add_argument(
        "--exam-key",
        action="append",
        dest="exam_keys",
        default=[],
        help="Optional exam_key filter. Repeat the flag to import multiple entries.",
    )

    # Behaviour switches, both off by default.
    switches = (
        (
            "--process",
            "Run the full paper processing pipeline after the files are uploaded.",
        ),
        (
            "--dry-run",
            "Print what would be imported without uploading or writing database rows.",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli.parse_args()
def load_manifest(path: Path) -> list[dict[str, Any]]:
    """Read the manifest at *path* (UTF-8 JSON) and return its top-level array.

    Raises:
        ValueError: if the decoded JSON document is not a list.
    """
    entries = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(entries, list):
        return entries
    raise ValueError("Manifest must be a JSON array.")
def should_import(entry: dict[str, Any], args: argparse.Namespace) -> bool:
    """Decide whether a manifest *entry* passes the CLI filters.

    An entry is kept only when it matches ``args.course_code`` (if given),
    its ``exam_key`` is among ``args.exam_keys`` (if any were given) and its
    ``importable`` field is truthy.
    """
    course_filter = args.course_code
    course_ok = not course_filter or entry.get("course_code") == course_filter
    if not course_ok:
        return False

    wanted_keys = args.exam_keys
    key_ok = not wanted_keys or entry.get("exam_key") in set(wanted_keys)
    if not key_ok:
        return False

    return bool(entry.get("importable"))
def resolve_file_path(root: Path, filename: str | None) -> Path | None:
if not filename:
return None
direct = root / filename
if direct.exists():
return direct
all_files = [candidate for candidate in root.iterdir() if candidate.is_file()]
def normalize(name: str) -> str:
return name.replace(" (1)", "")
target_name = normalize(filename)
normalized = [candidate for candidate in all_files if normalize(candidate.name) == target_name]
if len(normalized) == 1:
return normalized[0]
path = Path(filename)
normalized_stem = normalize(path.stem)
suffix = path.suffix
stem_matches = [
candidate
for candidate in all_files
if candidate.suffix == suffix and normalize(candidate.stem) == normalized_stem
]
if len(stem_matches) == 1:
return stem_matches[0]
return None
def read_file_bytes(root: Path, filename: str | None) -> bytes | None:
    """Return the bytes of *filename* resolved under *root*.

    Returns ``None`` when *filename* is empty (the manifest entry has no such
    file).

    Raises:
        FileNotFoundError: if a filename was given but cannot be resolved to
            an existing file.
    """
    if not filename:
        return None
    located = resolve_file_path(root, filename)
    if located is not None and located.exists():
        return located.read_bytes()
    raise FileNotFoundError(f"Referenced file does not exist under {root}: {filename}")
def build_storage_path(entry: dict[str, Any], kind: str) -> str:
    """Build the storage object key for one PDF of a manifest entry.

    The key has the form ``course-library/<course_code>/<exam_key>/<kind>.pdf``.
    """
    key = entry["exam_key"]
    course = entry["course_code"]
    return "course-library/{}/{}/{}.pdf".format(course, key, kind)
def upsert_paper_record(
    entry: dict[str, Any],
    user_id: str | None,
    paper_url: str,
    answer_url: str | None,
) -> str:
    """Create or refresh the ``papers`` row for a course-library manifest entry.

    A row is looked up by ``source_kind == "course_library"`` and
    ``source_exam_key == entry["exam_key"]``. If one exists it is updated in
    place, otherwise a new row is inserted.

    Args:
        entry: Manifest entry; must contain ``course_code``, ``year``,
            ``term``, ``exam_type`` and ``exam_key``.
        user_id: Owner to record on the row. When ``None`` and the row already
            exists, the stored owner is left untouched instead of being
            overwritten with NULL.
        paper_url: Public URL of the uploaded question PDF.
        answer_url: Public URL of the uploaded answer PDF, if any.

    Returns:
        The ``id`` of the updated or newly inserted row.
    """
    sb = get_supabase()
    payload = {
        "user_id": user_id,
        "course_code": entry["course_code"],
        "year": entry["year"],
        "term": entry["term"],
        "exam_type": entry["exam_type"],
        "part_label": entry.get("part_label"),
        "paper_file_url": paper_url,
        "answer_file_url": answer_url,
        # NOTE(review): every import, even without --process, marks the row as
        # "processing"; confirm a metadata-only re-import of a finished paper
        # is meant to change its status.
        "status": "processing",
        "source_kind": "course_library",
        "source_exam_key": entry["exam_key"],
        "source_question_filename": entry.get("question_pdf"),
        "source_answer_filename": entry.get("primary_answer_pdf"),
    }
    existing = (
        sb.table("papers")
        .select("id")
        .eq("source_kind", "course_library")
        .eq("source_exam_key", entry["exam_key"])
        .limit(1)
        .execute()
        .data
    )
    if existing:
        paper_id = existing[0]["id"]
        update_payload = payload
        if user_id is None:
            # --user-id is optional: re-running the import without it must not
            # null out the owner recorded by an earlier run.
            update_payload = {
                column: value for column, value in payload.items() if column != "user_id"
            }
        sb.table("papers").update(update_payload).eq("id", paper_id).execute()
        return paper_id
    created = sb.table("papers").insert(payload).execute().data
    return created[0]["id"]
def reset_existing_processed_data(paper_id: str) -> None:
    """Discard everything a previous processing run stored for *paper_id*.

    Deletes the paper's ``paper_questions`` rows, then sets the paper back to
    ``status="processing"`` with its error message, extracted texts and
    summary columns cleared to NULL.
    """
    client = get_supabase()
    client.table("paper_questions").delete().eq("paper_id", paper_id).execute()

    cleared_columns = (
        "error_message",
        "paper_extracted_text",
        "answer_extracted_text",
        "total_score",
        "question_count",
        "topics_summary",
        "difficulty_level",
    )
    reset_values = {"status": "processing", **dict.fromkeys(cleared_columns, None)}
    client.table("papers").update(reset_values).eq("id", paper_id).execute()
async def import_entry(
    entry: dict[str, Any],
    args: argparse.Namespace,
) -> None:
    """Upload one manifest entry's PDFs, upsert its paper row, optionally process it.

    Both PDFs are read from ``args.papers_root`` first, so a missing file
    fails even in dry-run mode. With ``args.dry_run`` only a summary line is
    printed. Otherwise the PDFs are uploaded to the ``papers`` storage bucket
    (overwriting existing objects), the ``papers`` row is upserted and, when
    ``args.process`` is set, earlier processed data is reset and the
    processing pipeline is run.

    Raises:
        ValueError: if the entry has no question PDF.
        FileNotFoundError: if a referenced PDF cannot be found.
    """
    question_name = entry.get("question_pdf")
    answer_name = entry.get("primary_answer_pdf")
    paper_bytes = read_file_bytes(args.papers_root, question_name)
    answer_bytes = read_file_bytes(args.papers_root, answer_name)
    if paper_bytes is None:
        raise ValueError(f"Importable entry is missing question PDF: {entry['exam_key']}")

    if args.dry_run:
        print(
            f"[dry-run] {entry['exam_key']}: "
            f"question={question_name} answer={answer_name}"
        )
        return

    sb = get_supabase()

    def publish(kind: str, blob: bytes) -> str:
        # Upload (with overwrite) and hand back the object's public URL.
        target = build_storage_path(entry, kind)
        sb.storage.from_("papers").upload(
            target,
            blob,
            file_options={"content-type": "application/pdf", "upsert": "true"},
        )
        return sb.storage.from_("papers").get_public_url(target)

    paper_url = publish("paper", paper_bytes)
    # An absent or empty answer PDF is simply not uploaded.
    answer_url = publish("answer", answer_bytes) if answer_bytes else None

    paper_id = upsert_paper_record(entry, args.user_id, paper_url, answer_url)
    print(f"Imported metadata for {entry['exam_key']} -> paper_id={paper_id}")

    if not args.process:
        return
    reset_existing_processed_data(paper_id)
    await process_paper(paper_id, paper_bytes, answer_bytes)
    print(f"Processed {entry['exam_key']}")
async def main() -> None:
    """Import every manifest entry that passes the command-line filters.

    Entries are imported one after another; an exception raised by any entry
    propagates and stops the run.
    """
    options = parse_args()

    selected = []
    for candidate in load_manifest(options.manifest):
        if should_import(candidate, options):
            selected.append(candidate)

    if selected:
        print(f"Preparing to import {len(selected)} manifest entries.")
        for item in selected:
            await import_entry(item, options)
    else:
        print("No manifest entries matched the provided filters.")


if __name__ == "__main__":
    asyncio.run(main())

17
backend/pyproject.toml Normal file
View File

@@ -0,0 +1,17 @@
[project]
name = "pastpaper-master-backend"
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.115.0",
"uvicorn[standard]>=0.30.0",
"python-dotenv>=1.0.0",
"python-multipart>=0.0.9",
"supabase>=2.0.0",
"openai>=1.50.0",
"PyMuPDF>=1.24.0",
"pydantic>=2.0.0",
"pydantic-settings>=2.0.0",
"httpx>=0.27.0",
"numpy>=2.4.4",
]

View File

@@ -0,0 +1,174 @@
"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions.
Reads existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by
paper_processor.py — but does UPDATE instead of INSERT, so question structure is untouched.
Run from the backend directory:
uv run python regen_ai_trio_comp2211.py
Pass --dry-run to print batches without calling the LLM or writing to the database.
"""
from __future__ import annotations
import asyncio
import json
import sys
from app.services.supabase_client import get_supabase
from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked
def build_reference_answer(q: dict) -> str:
    """Return the best available reference answer text for question row *q*.

    Preference order: the raw answer text as-is, then the correct option,
    then the correct answer (the latter two prefixed with a label). Returns
    an empty string when none of them is set.
    """
    raw = q.get("raw_answer_text")
    if raw:
        return raw

    labelled_fields = (
        ("correct_option", "Correct option"),
        ("correct_answer", "Correct answer"),
    )
    for field, label in labelled_fields:
        value = q.get(field)
        if value:
            return f"{label}: {value}"
    return ""
def _write_trio(sb, row_id: str, analysis: dict) -> None:
    """Store one analysis' knowledge_reminder / ai_hint / solution on a question row."""
    sb.table("paper_questions").update({
        "knowledge_reminder": analysis.get("knowledge_reminder", ""),
        "ai_hint": analysis.get("ai_hint", ""),
        "solution": analysis.get("solution", ""),
    }).eq("id", row_id).execute()


async def _regen_single(sb, row_id: str, payload: dict) -> bool:
    """Regenerate and store the AI trio for one question; return True on success."""
    try:
        result = await qwen_json_completion(
            system_prompt=BATCH_ANALYSIS_PROMPT.format(
                questions_payload=json.dumps([payload], ensure_ascii=False),
            ),
            temperature=0.3,
            max_tokens=8192,
        )
        analyses = result.get("analyses", [])
        if not analyses:
            return False
        _write_trio(sb, row_id, analyses[0])
        return True
    except Exception as exc:
        # Best effort per question, but report why instead of failing silently.
        print(f"\n single retry failed for {payload.get('question_number')}: {exc}")
        return False


async def _regen_batch(sb, batch_rows: list[tuple[str, dict]]) -> int:
    """Regenerate the AI trio for one batch of ``(row_id, payload)`` pairs.

    The batch is sent to the LLM in one request. Any row that has no usable
    analysis in the response, or whose write fails, is retried on its own.
    Each row is attempted and counted at most once per path, so the returned
    number of updated rows never double-counts.
    """
    batch = [payload for _, payload in batch_rows]
    try:
        result = await qwen_json_completion(
            system_prompt=BATCH_ANALYSIS_PROMPT.format(
                questions_payload=json.dumps(batch, ensure_ascii=False),
            ),
            temperature=0.3,
            max_tokens=8192,
        )
        analyses = {
            item["question_number"]: item
            for item in result.get("analyses", [])
            if item.get("question_number")
        }
    except Exception as exc:
        # Whole batch failed: every question falls through to the 1-by-1 retry.
        print(f" [batch error: {exc}; retrying 1-by-1]", end="", flush=True)
        analyses = {}

    written = 0
    for row_id, payload in batch_rows:
        qnum = payload["question_number"]
        analysis = analyses.get(qnum)
        ok = False
        if analysis:
            try:
                _write_trio(sb, row_id, analysis)
                ok = True
            except Exception as exc:
                print(f"\n write failed for {qnum}: {exc}")
        if not ok:
            ok = await _regen_single(sb, row_id, payload)
            # Small pause between individual retries.
            await asyncio.sleep(1)
        if ok:
            written += 1
        else:
            print(f"\n SKIP: {qnum}")
    return written


async def regen(dry_run: bool = False) -> None:
    """Regenerate the AI trio for every COMP2211 course-library question.

    Existing ``paper_questions`` rows are read and only their
    ``knowledge_reminder``, ``ai_hint`` and ``solution`` columns are updated.
    Questions are sent to the LLM in batches of three, never mixing papers
    within a batch.

    Args:
        dry_run: When true, print the batches that would be sent without
            calling the LLM or writing to the database.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )
    print(f"Found {len(questions)} questions across {len(paper_ids)} papers.")

    # Group (row id, LLM payload) pairs by paper so batches don't mix papers
    # (cleaner context for the LLM). Query order is preserved within a paper.
    rows_by_paper: dict[str, list[tuple[str, dict]]] = {}
    for q in questions:
        payload = {
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q.get("question_text") or "",
            "topics": q.get("topics") or [],
            "reference_answer": build_reference_answer(q),
        }
        rows_by_paper.setdefault(q["paper_id"], []).append((q["id"], payload))

    total_updated = 0
    total_papers = len(rows_by_paper)
    for paper_idx, (paper_id, rows) in enumerate(rows_by_paper.items(), 1):
        print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} ({len(rows)} questions)")
        for batch_idx, batch_rows in enumerate(chunked(rows, 3), 1):
            numbers = [payload["question_number"] for _, payload in batch_rows]
            print(f" Batch {batch_idx}: questions {numbers}", end="", flush=True)
            if dry_run:
                print(" [dry-run, skipped]")
                continue
            written = await _regen_batch(sb, batch_rows)
            total_updated += written
            print(f" -> {written}/{len(batch_rows)} written")
            # Pause between batches to stay gentle on the LLM endpoint.
            await asyncio.sleep(2.5)
    print(f"\nDone. {total_updated} questions updated.")


if __name__ == "__main__":
    dry_run = "--dry-run" in sys.argv
    asyncio.run(regen(dry_run=dry_run))

View File

@@ -0,0 +1,69 @@
"""Re-generate AI trio (knowledge_reminder, ai_hint, solution) in English for existing questions."""
import json
import asyncio
from app.services.supabase_client import get_supabase
from app.services.llm_clients import get_qwen_client
from app.services.paper_processor import ANALYSIS_PROMPT
async def regenerate_for_paper(paper_id: str):
    """Regenerate knowledge_reminder / ai_hint / solution for one paper's questions.

    Each question is sent to ``qwen-plus`` on its own and the three fields of
    the JSON reply are written back to its ``paper_questions`` row. A failure
    on one question is reported and the loop continues with the next one, so
    a single bad reply no longer aborts the whole run.

    Args:
        paper_id: ``papers.id`` whose questions should be regenerated.
    """
    sb = get_supabase()
    qwen = get_qwen_client()
    questions = sb.table("paper_questions").select("*").eq("paper_id", paper_id).order("display_order").execute().data
    print(f"Found {len(questions)} questions for paper {paper_id[:8]}")

    failures = 0
    for q in questions:
        qnum = q["question_number"]
        print(f" Regenerating Q{qnum}...", end=" ", flush=True)

        answer_section = ""
        if q.get("raw_answer_text"):
            answer_section = f"- Reference answer: {q['raw_answer_text']}"
        elif q.get("correct_option"):
            answer_section = f"- Correct option: {q['correct_option']}"
        elif q.get("correct_answer"):
            answer_section = f"- Correct answer: {q['correct_answer']}"

        # Rows come from select("*"), so the keys exist even when the column
        # is NULL; fall back explicitly rather than relying on .get defaults.
        score = q.get("score")
        topics = q.get("topics") or []

        try:
            resp = qwen.chat.completions.create(
                model="qwen-plus",
                messages=[
                    {"role": "system", "content": ANALYSIS_PROMPT.format(
                        question_number=qnum,
                        question_type=q["question_type"],
                        score=score if score is not None else "unknown",
                        question_text=q["question_text"],
                        topics=", ".join(topics),
                        answer_section=answer_section,
                    )},
                ],
                temperature=0.3,
                response_format={"type": "json_object"},
            )
            analysis = json.loads(resp.choices[0].message.content)
            sb.table("paper_questions").update({
                "knowledge_reminder": analysis.get("knowledge_reminder", ""),
                "ai_hint": analysis.get("ai_hint", ""),
                "solution": analysis.get("solution", ""),
            }).eq("id", q["id"]).execute()
        except Exception as exc:
            # Per-question boundary: report and keep the existing row content.
            failures += 1
            print(f"FAILED: {exc}")
            continue
        print("done")

    if failures:
        print(f"Finished paper {paper_id[:8]} with {failures} failed question(s)")
    else:
        print(f"All questions regenerated for paper {paper_id[:8]}")
async def main():
    """Regenerate the AI trio for every ``ready`` paper, newest first."""
    client = get_supabase()
    ready = (
        client.table("papers")
        .select("id,course_code,year,term")
        .eq("status", "ready")
        .order("created_at", desc=True)
        .execute()
    )
    for paper in ready.data:
        print(f"\n=== {paper['course_code']} {paper['year']} {paper['term']} ===")
        await regenerate_for_paper(paper["id"])
    print("\nAll done!")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,224 @@
"""Split COMP2211 Spring 2022 final part A into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from app.services.supabase_client import get_supabase
EXAM_KEY = "COMP2211-2022-spring-final-part-a"
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
PROBLEM_SEED_PATH = (
Path(__file__).resolve().parent.parent
/ "pastpaper-scraper"
/ "reviews"
/ "COMP2211"
/ "problem_seed.json"
)
@dataclass(frozen=True)
class ChildSpec:
    """Hand-written description of one sub-question row to create for this exam.

    ``main`` turns each spec listed in ``CHILDREN`` into one
    ``paper_questions`` row, cutting the question and answer text out of the
    seed row of the top-level question.
    """

    # Value written to the row's ``question_number`` column, e.g. "3e_i".
    question_number: str
    # Value written to the row's ``parent_question`` column, e.g. "3e".
    parent_question: str
    # ``question_number`` of the seed row whose text is split, e.g. "3".
    top_level_number: str
    # Marker labels passed to ``extract_segment``, e.g. ("e", "i").
    path: tuple[str, ...]
    score: float
    # "true_false" and "fill_blank" get special answer handling in ``main``.
    question_type: str
    question_format: str | None = None
    # The optional topic/skill fields below are fallbacks: ``main`` prefers a
    # value already stored on the existing row, then these, then the seed row.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a short-answer sub-question.

    The spec always has ``question_type="long_question"`` and
    ``question_format="short_answer"``; all other fields are taken from the
    arguments unchanged.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        question_type="long_question",
        question_format="short_answer",
        page_number=page_number,
        **tagging,
    )
CHILDREN: list[ChildSpec] = [
ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "architecture_reasoning"), page_number=2),
ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "activation_selection"), page_number=2),
ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "metric_reasoning"), page_number=2),
ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "hardware_reasoning"), page_number=2),
ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "image_processing"), page_number=2),
ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "cnn_architecture"), page_number=2),
ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "regularization"), page_number=2),
ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "game_reasoning"), page_number=2),
ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "pruning_reasoning"), page_number=2),
ChildSpec("2a", "2", "2", ("a",), 6.5, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "probability_reasoning", "classification_decision"), page_number=4),
ChildSpec("2b", "2", "2", ("b",), 7.5, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "algorithm_tracing", "classification_decision"), page_number=4),
short_answer("3a", "3", "3", ("a",), 3, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("concept_explanation", "metric_reasoning"), page_number=6),
short_answer("3b", "3", "3", ("b",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation", "activation_selection"), page_number=6),
short_answer("3c", "3", "3", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("architecture_reasoning", "output_layer_design"), page_number=6),
short_answer("3d", "3", "3", ("d",), 3, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation", "optimization_reasoning"), page_number=6),
short_answer("3e_i", "3e", "3", ("e", "i"), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("optimization_reasoning",), page_number=6),
short_answer("3e_ii", "3e", "3", ("e", "ii"), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("optimization_reasoning",), page_number=6),
short_answer("3f", "3", "3", ("f",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("regularization", "concept_explanation"), page_number=6),
ChildSpec("4a_i", "4a", "4", ("a", "i"), 2, "fill_blank", "fill_blank", page_number=7),
ChildSpec("4a_ii", "4a", "4", ("a", "ii"), 2, "long_question", "long_answer", page_number=7),
ChildSpec("4b_i", "4b", "4", ("b", "i"), 3, "fill_blank", "fill_blank", page_number=7),
ChildSpec("4b_ii", "4b", "4", ("b", "ii"), 4, "fill_blank", "fill_blank", page_number=7),
ChildSpec("4b_iii", "4b", "4", ("b", "iii"), 4, "long_question", "long_answer", page_number=7),
]
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")
def split_sections(text: str) -> tuple[str, dict[str, str]]:
matches = list(MARKER_RE.finditer(text))
if not matches:
return text.strip(), {}
intro = text[: matches[0].start()].strip()
sections: dict[str, str] = {}
for idx, match in enumerate(matches):
marker = match.group(1)
end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
sections[marker] = text[match.start() : end].strip()
return intro, sections
def extract_segment(text: str, path: tuple[str, ...]) -> str:
current = text.strip()
carried_intro: list[str] = []
for depth, marker in enumerate(path):
intro, sections = split_sections(current)
if depth == 0 and intro:
carried_intro.append(intro)
current = sections.get(marker, current)
return "\n".join(part for part in [*carried_intro, current] if part).strip()
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map each part label ``a``-``j`` to the ``T``/``F`` found after its marker.

    A line-leading ``(x)`` marker counts when it is followed, on the same or
    the next line, by a standalone ``T`` or ``F``. If a label appears more
    than once the last match wins.
    """
    pattern = re.compile(r"(?m)^\(([a-j])\)\s*\n?([TF])\b")
    return {hit.group(1): hit.group(2) for hit in pattern.finditer(answer_text)}
def derive_correct_answer(answer_text: str) -> str | None:
if not answer_text:
return None
tail = answer_text.split("Answer:", 1)[1] if "Answer:" in answer_text else answer_text
lines = [line.strip() for line in tail.splitlines() if line.strip()]
if not lines:
return None
first = lines[0]
if first.lower().startswith("marking scheme"):
return None
if len(first) <= 240:
return first
return None
def load_seed_rows() -> dict[str, dict]:
    """Load this exam's seed rows from ``PROBLEM_SEED_PATH``, keyed by question number.

    The seed file is decoded explicitly as UTF-8 so the result does not
    depend on the platform's default encoding. Only rows whose
    ``source_exam_key`` equals ``EXAM_KEY`` are kept.
    """
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }
def main() -> None:
    """Replace the paper's question rows with the sub-questions in ``CHILDREN``.

    For every ``ChildSpec`` the question and answer text are cut out of the
    seed row of its top-level question. Topic, skill, difficulty and AI-trio
    fields are carried over from an existing row with the same question
    number when present. All rows are built and validated first; only then
    are the paper's current rows deleted and the new ones inserted.

    Raises:
        SystemExit: if no paper has ``source_exam_key == EXAM_KEY`` or the
            seed file lacks a top-level question referenced by ``CHILDREN``.
            Nothing has been deleted at that point.
    """
    sb = get_supabase()
    found = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not found:
        raise SystemExit(f"No paper found with source_exam_key={EXAM_KEY!r}; import it first.")
    paper_id = found[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}

    parent_rows = load_seed_rows()
    missing_parents = sorted({child.top_level_number for child in CHILDREN} - parent_rows.keys())
    if missing_parents:
        raise SystemExit(
            f"Seed file has no rows for top-level question(s) {missing_parents} of {EXAM_KEY}."
        )

    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")
    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)
        correct_option = None
        correct_answer = None
        options = None
        if child.question_type == "true_false":
            # NOTE(review): this stores "T"/"F" while the option labels are
            # "True"/"False" - confirm the consumer of correct_option expects that.
            correct_option = tf_answers.get(child.path[0])
            options = TRUE_FALSE_OPTIONS
        elif child.question_type == "fill_blank":
            correct_answer = derive_correct_answer(raw_answer_text)
        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": correct_answer,
                "raw_answer_text": raw_answer_text,
                # Precedence for the tagging fields: existing row, then the
                # spec, then the seed row of the top-level question.
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # NOTE: delete and insert are two separate requests; if the insert fails
    # the paper is left without question rows until the script is re-run.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,232 @@
"""Split COMP2211 Spring 2022 final part B into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from app.services.supabase_client import get_supabase
EXAM_KEY = "COMP2211-2022-spring-final-part-b"
PROBLEM_SEED_PATH = (
Path(__file__).resolve().parent.parent
/ "pastpaper-scraper"
/ "reviews"
/ "COMP2211"
/ "problem_seed.json"
)
@dataclass(frozen=True)
class ChildSpec:
    """Hand-written description of one sub-question row to create for this exam.

    ``main`` turns each spec listed in ``CHILDREN`` into one
    ``paper_questions`` row, cutting the question and answer text out of the
    seed row of the top-level question.
    """

    # Value written to the row's ``question_number`` column, e.g. "1f_i".
    question_number: str
    # Value written to the row's ``parent_question`` column, e.g. "1f".
    parent_question: str
    # ``question_number`` of the seed row whose text is split, e.g. "1".
    top_level_number: str
    # Marker labels passed to ``extract_segment``, e.g. ("f", "i").
    path: tuple[str, ...]
    score: float
    question_type: str
    question_format: str | None = None
    # The optional topic/skill fields below are fallbacks: ``main`` prefers a
    # value already stored on the existing row, then these, then the seed row.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # (label, text) pairs; ``main`` converts them to {"label", "text"} dicts.
    options: tuple[tuple[str, str], ...] | None = None
    # Written as-is to the ``correct_option`` / ``correct_answer`` columns.
    correct_option: str | None = None
    correct_answer: str | None = None
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a short-answer sub-question.

    The spec always has ``question_type="long_question"`` and
    ``question_format="short_answer"``; all other fields are taken from the
    arguments unchanged.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        question_type="long_question",
        question_format="short_answer",
        correct_answer=correct_answer,
        page_number=page_number,
        **tagging,
    )
def mc(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    options: tuple[tuple[str, str], ...],
    correct_option: str,
    analytics_topic: str,
    skill_tags: tuple[str, ...],
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a multiple-choice sub-question.

    ``question_type`` and ``question_format`` are both ``"mc"``. The single
    *analytics_topic* is reused as the primary topic and as the only topic
    tag.
    """
    topic = analytics_topic
    single_topic_fields = {
        "analytics_topic": topic,
        "topic_primary": topic,
        "topic_tags": (topic,),
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        question_type="mc",
        question_format="mc",
        skill_tags=skill_tags,
        options=options,
        correct_option=correct_option,
        page_number=page_number,
        **single_topic_fields,
    )
ETHICS_ABCD = (
("A", "A"),
("B", "B"),
("C", "C"),
("D", "D"),
)
CHILDREN: list[ChildSpec] = [
ChildSpec("1a", "1", "1", ("a",), 1.5, "long_question", "long_answer", page_number=2),
short_answer("1b", "1", "1", ("b",), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("concept_explanation", "data_augmentation"), page_number=2),
ChildSpec("1c", "1", "1", ("c",), 4.5, "long_question", "long_answer", page_number=2),
short_answer("1d", "1", "1", ("d",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "parameter_reduction"), page_number=3),
ChildSpec("1e", "1", "1", ("e",), 2.5, "fill_blank", "fill_blank", correct_answer="1558656", page_number=3),
ChildSpec("1f_i", "1f", "1", ("f", "i"), 2.5, "fill_blank", "fill_blank", correct_answer="2071656", page_number=3),
ChildSpec("1f_ii", "1f", "1", ("f", "ii"), 2.5, "fill_blank", "fill_blank", correct_answer="150529000", page_number=4),
short_answer("1g", "1", "1", ("g",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "comparison"), page_number=4),
ChildSpec("2a", "2", "2", ("a",), 9, "long_question", "coding", page_number=5),
short_answer("2b", "2", "2", ("b",), 4, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "regression_reasoning"), page_number=6),
ChildSpec("3a", "3", "3", ("a",), 3.5, "long_question", "long_answer", page_number=9),
short_answer("3b", "3", "3", ("b",), 0.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("game_reasoning",), correct_answer="E-a", page_number=9),
short_answer("3c", "3", "3", ("c",), 1.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("concept_explanation", "game_reasoning"), page_number=9),
short_answer("3d", "3", "3", ("d",), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning_reasoning",), correct_answer="E-j and E-f", page_number=9),
mc("4a", "4", "4", ("a",), 1, options=ETHICS_ABCD, correct_option="C", analytics_topic="Ethics of AI", skill_tags=("concept_check", "ethical_reasoning"), page_number=10),
mc("4b", "4", "4", ("b",), 1, options=ETHICS_ABCD, correct_option="A", analytics_topic="Ethics of AI", skill_tags=("concept_check", "bias_reasoning"), page_number=10),
mc("4c", "4", "4", ("c",), 1, options=ETHICS_ABCD, correct_option="C", analytics_topic="Ethics of AI", skill_tags=("concept_check", "ethical_reasoning"), page_number=10),
mc("4d", "4", "4", ("d",), 1, options=ETHICS_ABCD, correct_option="B", analytics_topic="Ethics of AI", skill_tags=("concept_check", "bias_reasoning"), page_number=10),
short_answer("4e", "4", "4", ("e",), 3, analytics_topic="Ethics of AI", topic_primary="Ethics of AI", topic_tags=("Ethics of AI",), skill_tags=("argumentation", "concept_explanation"), page_number=11),
]
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")
def split_sections(text: str) -> tuple[str, dict[str, str]]:
matches = list(MARKER_RE.finditer(text))
if not matches:
return text.strip(), {}
intro = text[: matches[0].start()].strip()
sections: dict[str, str] = {}
for idx, match in enumerate(matches):
marker = match.group(1)
end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
sections[marker] = text[match.start() : end].strip()
return intro, sections
def extract_segment(text: str, path: tuple[str, ...]) -> str:
current = text.strip()
carried_intro: list[str] = []
for depth, marker in enumerate(path):
intro, sections = split_sections(current)
if depth == 0 and intro:
carried_intro.append(intro)
current = sections.get(marker, current)
return "\n".join(part for part in [*carried_intro, current] if part).strip()
def load_seed_rows() -> dict[str, dict]:
    """Load this exam's seed rows from ``PROBLEM_SEED_PATH``, keyed by question number.

    The seed file is decoded explicitly as UTF-8 so the result does not
    depend on the platform's default encoding. Only rows whose
    ``source_exam_key`` equals ``EXAM_KEY`` are kept.
    """
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }
def main() -> None:
    """Replace the paper's ``paper_questions`` rows with the ``CHILDREN`` split.

    Looks up the paper for ``EXAM_KEY``, builds one row per ``ChildSpec`` from
    the seed text, deletes every existing ``paper_questions`` row of that
    paper, inserts the new rows, and sets the paper's ``question_count`` and
    ``status`` ("processing").

    Raises:
        LookupError: If the paper or a required top-level seed row is missing.
            Both checks run before anything is deleted.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        raise LookupError(f"No paper found with source_exam_key {EXAM_KEY!r}.")
    paper_id = papers[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    missing_parents = sorted({child.top_level_number for child in CHILDREN}.difference(parent_rows))
    if missing_parents:
        raise LookupError(f"Seed rows missing for {EXAM_KEY!r}: questions {', '.join(missing_parents)}.")
    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        # Values already stored for this question number take precedence for
        # the tagging columns, so a re-run does not discard them.
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)
        options = None
        if child.options:
            options = [{"label": label, "text": text} for label, text in child.options]
        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": child.correct_option,
                "correct_answer": child.correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                # NOTE(review): the "" default only applies when no row existed;
                # a stored NULL is forwarded as None. Confirm that is intended.
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )
    # NOTE: delete and insert are separate requests; if the insert fails the
    # paper is left without questions until the script is re-run.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,233 @@
"""Split COMP2211 Spring 2022 midterm top-level problems into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from app.services.supabase_client import get_supabase
# Key used to find this exam's row in papers.source_exam_key and to select
# its rows from the seed file.
EXAM_KEY = "COMP2211-2022-spring-midterm"
# Option list attached to every true/false subquestion row.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
@dataclass(frozen=True)
class ChildSpec:
    """Static description of one subquestion row to create for this exam."""

    # Number stored on the new row, e.g. "2a_i".
    question_number: str
    # Value stored in the row's parent_question column, e.g. "2a".
    parent_question: str
    # question_number of the seed row whose text is split, e.g. "2".
    top_level_number: str
    # Part markers to follow inside the seed text, e.g. ("a", "i").
    path: tuple[str, ...]
    score: float
    question_type: str
    question_format: str | None = None
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` typed "long_question" with format "short_answer".

    All arguments are forwarded unchanged to the spec; ``page_number`` must be
    given by keyword.
    """
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        question_format="short_answer",
        page_number=page_number,
    )
# Every subquestion of the exam, in display order.  main() numbers the rows
# by their position in this list.
CHILDREN: list[ChildSpec] = [
    # Problem 1: ten true/false parts (a)-(j).
    *[
        ChildSpec(f"1{letter}", "1", "1", (letter,), 1.5, "true_false", page_number=2)
        for letter in "abcdefghij"
    ],
    # Problem 2.
    ChildSpec("2a_i", "2a", "2", ("a", "i"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_ii", "2a", "2", ("a", "ii"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_iii", "2a", "2", ("a", "iii"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_iv", "2a", "2", ("a", "iv"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_v", "2a", "2", ("a", "v"), 1, "fill_blank", page_number=4),
    ChildSpec("2b", "2", "2", ("b",), 2, "fill_blank", page_number=4),
    ChildSpec("2c", "2", "2", ("c",), 9, "long_question", "coding", page_number=5),
    # Problem 3.
    ChildSpec("3a", "3", "3", ("a",), 2, "fill_blank", page_number=7),
    ChildSpec("3b_i", "3b", "3", ("b", "i"), 1.75, "fill_blank", page_number=7),
    ChildSpec("3b_ii", "3b", "3", ("b", "ii"), 1.75, "fill_blank", page_number=7),
    ChildSpec("3b_iii", "3b", "3", ("b", "iii"), 1.75, "fill_blank", page_number=7),
    ChildSpec("3b_iv", "3b", "3", ("b", "iv"), 1.75, "fill_blank", page_number=7),
    short_answer("3c", "3", "3", ("c",), 2, page_number=8),
    # Problem 4.
    ChildSpec("4a", "4", "4", ("a",), 3, "long_question", "long_answer", page_number=9),
    short_answer("4b_i", "4b", "4", ("b", "i"), 3, page_number=9),
    short_answer("4b_ii", "4b", "4", ("b", "ii"), 3, page_number=9),
    ChildSpec("4c_i", "4c", "4", ("c", "i"), 2, "long_question", "long_answer", page_number=10),
    ChildSpec("4c_ii", "4c", "4", ("c", "ii"), 3, "long_question", "long_answer", page_number=10),
    # Problem 5.
    ChildSpec("5a", "5", "5", ("a",), 4.5, "long_question", "long_answer", page_number=11),
    ChildSpec("5b", "5", "5", ("b",), 1.5, "fill_blank", page_number=11),
    ChildSpec("5c", "5", "5", ("c",), 4.5, "long_question", "long_answer", page_number=11),
    short_answer("5d", "5", "5", ("d",), 1.5, page_number=11),
    # Problem 6.
    ChildSpec("6a", "6", "6", ("a",), 8, "long_question", "long_answer", page_number=12),
    short_answer("6b", "6", "6", ("b",), 2, page_number=13),
    ChildSpec("6c", "6", "6", ("c",), 10, "long_question", "coding", page_number=13),
    # Problem 7.
    short_answer("7a", "7", "7", ("a",), 4, page_number=14),
    short_answer("7b", "7", "7", ("b",), 6, page_number=14),
    ChildSpec("7c", "7", "7", ("c",), 2, "fill_blank", page_number=15),
]
# Matches a part marker such as "(a)" or "(iii)" at the start of a line.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
# Seed file with the top-level problems of every COMP2211 exam, located
# relative to the parent of this script's directory.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)

# Roman-numeral markers recognised as second-level parts, in list order.
_ROMAN_MARKERS = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii")


def _marker_depths(markers: list[str]) -> list[int]:
    """Classify each marker (in document order) as depth 0 or depth 1.

    Letter markers ("a", "b", ...) are depth 0.  A run of roman numerals that
    follows a letter marker is depth 1.  "(i)" is ambiguous (letter vs. roman
    one): it is treated as a letter when it continues the alphabet (after
    "h") or is directly followed by "j", unless the next marker is "(ii)".
    Roman numerals with no preceding letter marker stay at depth 0.
    """
    depths: list[int] = []
    last_letter = None  # most recent single-letter depth-0 marker
    roman_pos = 0  # index of the next expected roman numeral; 0 = not in a sub-list
    for idx, marker in enumerate(markers):
        if roman_pos and marker in _ROMAN_MARKERS[roman_pos:]:
            # Continuation of the current sub-list (gaps are tolerated).
            depths.append(1)
            roman_pos = _ROMAN_MARKERS.index(marker) + 1
            continue
        if marker == "i" and last_letter is not None:
            following = markers[idx + 1] if idx + 1 < len(markers) else None
            expected_letter = chr(ord(last_letter) + 1)
            if following == "ii" or (marker != expected_letter and following != "j"):
                depths.append(1)
                roman_pos = 1
                continue
        depths.append(0)
        roman_pos = 0
        if len(marker) == 1:
            last_letter = marker
    return depths


def split_sections(text: str, *, level: int | None = None) -> tuple[str, dict[str, str]]:
    """Split *text* into an intro and the sections opened by part markers.

    Args:
        text: Question or answer text containing line-leading "(x)" markers.
        level: ``None`` (default) splits at every marker, exactly as before.
            ``0`` splits only at top-level markers so each section keeps its
            nested "(i)", "(ii)" parts; ``1`` splits only at nested markers.

    Returns:
        ``(intro, sections)`` where *intro* is the stripped text before the
        first marker of the requested level and *sections* maps each marker
        to its stripped text (marker included).  When no marker of that level
        exists the result is ``(text.strip(), {})``.  If a marker repeats,
        the last occurrence wins.
    """
    matches = list(MARKER_RE.finditer(text))
    if level is None:
        depths = [0] * len(matches)  # flat mode: every marker is a boundary
        level = 0
    else:
        depths = _marker_depths([match.group(1) for match in matches])
    openers = [idx for idx, depth in enumerate(depths) if depth == level]
    if not openers:
        return text.strip(), {}
    intro = text[: matches[openers[0]].start()].strip()
    sections: dict[str, str] = {}
    for idx in openers:
        # A section runs until the next marker at the same or a shallower depth.
        end = next(
            (matches[later].start() for later in range(idx + 1, len(matches)) if depths[later] <= level),
            len(text),
        )
        sections[matches[idx].group(1)] = text[matches[idx].start() : end].strip()
    return intro, sections


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the text of the part addressed by *path* with its lead-in text.

    Args:
        text: Full text of a top-level question (or its answer).
        path: Markers to follow: ``()``, ``("a",)`` or ``("a", "i")``.

    Returns:
        * empty path, or first marker not found: the stripped *text*;
        * one marker: top-level intro + that section (nested parts included);
        * two markers: top-level intro + the parent's own stem + the nested
          part.  If the nested part is not found, the whole parent section is
          returned instead of dropping its content.
    """
    if not path:
        return text.strip()
    intro, sections = split_sections(text, level=0)
    # Depth classification is heuristic; fall back to the flat split so
    # anything the old implementation could find is still found.
    first = sections.get(path[0]) or split_sections(text)[1].get(path[0], "")
    if not first:
        return text.strip()
    if len(path) == 1:
        return "\n".join(part for part in [intro, first] if part).strip()
    child_intro, child_sections = split_sections(first, level=1)
    second = child_sections.get(path[1], "")
    if not second:
        return "\n".join(part for part in [intro, first] if part).strip()
    return "\n".join(part for part in [intro, child_intro, second] if part).strip()
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map each part letter "a".."j" to the "T" or "F" that follows its marker.

    A marker counts only at the start of a line, and the letter must stand
    alone ("True" does not match).  If a marker repeats, the last one wins.
    """
    pattern = re.compile(r"(?m)^\(([a-j])\)\s*\n?([TF])\b")
    return {hit.group(1): hit.group(2) for hit in pattern.finditer(answer_text)}
def derive_correct_answer(answer_text: str) -> str | None:
    """Pick a short answer string out of a raw answer block.

    Looks at the text after the first "Answer:" (or the whole text when that
    label is absent) and returns its first non-blank line, stripped.  Returns
    ``None`` when there is no such line, when it starts with "marking scheme"
    (case-insensitive), or when it is longer than 240 characters.
    """
    if not answer_text:
        return None
    _, label, after = answer_text.partition("Answer:")
    body = after if label else answer_text
    candidate = next((ln.strip() for ln in body.splitlines() if ln.strip()), None)
    if candidate is None:
        return None
    if candidate.lower().startswith("marking scheme") or len(candidate) > 240:
        return None
    return candidate
def load_seed_rows(*, path: Path | None = None, exam_key: str | None = None) -> dict[str, dict]:
    """Load this exam's top-level seed rows, keyed by ``question_number``.

    Args:
        path: Seed JSON file to read; defaults to ``PROBLEM_SEED_PATH``.
        exam_key: ``source_exam_key`` to keep; defaults to ``EXAM_KEY``.

    Returns:
        Mapping of ``question_number`` to the seed row.  Rows belonging to
        other exams, or lacking ``source_exam_key``, are skipped.  If a
        question number repeats, the last row wins.
    """
    seed_path = PROBLEM_SEED_PATH if path is None else path
    wanted_key = EXAM_KEY if exam_key is None else exam_key
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding, which would garble or reject non-ASCII exam text.
    data = json.loads(seed_path.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row.get("source_exam_key") == wanted_key
    }
def main() -> None:
    """Replace the paper's ``paper_questions`` rows with the ``CHILDREN`` split.

    Looks up the paper for ``EXAM_KEY``, builds one row per ``ChildSpec`` from
    the seed text, deletes every existing ``paper_questions`` row of that
    paper, inserts the new rows, and sets the paper's ``question_count`` and
    ``status`` ("processing").

    Raises:
        LookupError: If the paper or a required top-level seed row is missing.
            Both checks run before anything is deleted.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id")
        .eq("source_exam_key", EXAM_KEY)
        .execute()
        .data
    )
    if not papers:
        raise LookupError(f"No paper found with source_exam_key {EXAM_KEY!r}.")
    paper_id = papers[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    missing_parents = sorted({child.top_level_number for child in CHILDREN}.difference(parent_rows))
    if missing_parents:
        raise LookupError(f"Seed rows missing for {EXAM_KEY!r}: questions {', '.join(missing_parents)}.")
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")
    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        # Values already stored for this question number take precedence for
        # the tagging columns, so a re-run does not discard them.
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)
        correct_option = None
        correct_answer = None
        options = None
        if child.question_type == "true_false":
            marker = child.path[0]
            # NOTE(review): this stores "T"/"F" while the option labels are
            # "True"/"False" -- confirm the consumer maps between them.
            correct_option = tf_answers.get(marker)
            options = TRUE_FALSE_OPTIONS
        elif child.question_type == "fill_blank":
            correct_answer = derive_correct_answer(raw_answer_text)
        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or parent.get("topics"),
                "topic_primary": existing.get("topic_primary") or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or parent.get("topic_tags"),
                "skill_tags": existing.get("skill_tags") or parent.get("skill_tags"),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                # NOTE(review): the "" default only applies when no row existed;
                # a stored NULL is forwarded as None. Confirm that is intended.
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )
    # NOTE: delete and insert are separate requests; if the insert fails the
    # paper is left without questions until the script is re-run.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,268 @@
"""Split COMP2211 Spring 2023 midterm into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from app.services.supabase_client import get_supabase
# Key used to find this exam's row in papers.source_exam_key and to select
# its rows from the seed file.
EXAM_KEY = "COMP2211-2023-spring-midterm"
# Seed file with the top-level problems of every COMP2211 exam, located
# relative to the parent of this script's directory.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)
# Option list attached to every true/false subquestion row.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
@dataclass(frozen=True)
class ChildSpec:
    """Static description of one subquestion row to create for this exam."""

    # Number stored on the new row, e.g. "2a_i".
    question_number: str
    # Value stored in the row's parent_question column, e.g. "2a".
    parent_question: str
    # question_number of the seed row whose text is split, e.g. "2".
    top_level_number: str
    # Part markers to follow inside the seed text; () means the whole text.
    path: tuple[str, ...]
    score: float
    question_type: str
    question_format: str | None = None
    # Tagging defaults; main() uses them only when the existing row has no
    # value, and falls back to the seed row when they are unset here.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # (label, text) pairs for multiple-choice rows.
    options: tuple[tuple[str, str], ...] | None = None
    correct_option: str | None = None
    correct_answer: str | None = None
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` typed "long_question" with format "short_answer".

    The positional arguments identify the subquestion; the keyword-only
    arguments carry its optional tagging, known answer and page number.
    ``options`` and ``correct_option`` keep their ``ChildSpec`` defaults.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        question_format="short_answer",
        correct_answer=correct_answer,
        page_number=page_number,
        **tagging,
    )
def mc(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    options: tuple[tuple[str, str], ...],
    correct_option: str,
    analytics_topic: str,
    skill_tags: tuple[str, ...],
    page_number: int,
) -> ChildSpec:
    """Build a multiple-choice ``ChildSpec`` (type and format both "mc").

    ``analytics_topic`` is reused as the primary topic and as the single
    entry of ``topic_tags``.
    """
    topic = analytics_topic
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "mc",
        question_format="mc",
        analytics_topic=topic,
        topic_primary=topic,
        topic_tags=(topic,),
        skill_tags=skill_tags,
        options=options,
        correct_option=correct_option,
        page_number=page_number,
    )
# (label, text) pairs for a five-choice question whose option text is just
# the letter itself.
ABCDE = (("A", "A"), ("B", "B"), ("C", "C"), ("D", "D"), ("E", "E"))
# Every subquestion of the exam, in display order.  main() numbers the rows
# by their position in this list.
CHILDREN: list[ChildSpec] = [
    # Problem 1: true/false parts (a)-(j).
    ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("concept_check", "classification_decision"), page_number=3),
    ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("concept_check", "classification_decision"), page_number=3),
    ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
    ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "distance_reasoning"), page_number=3),
    ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "validation_reasoning"), page_number=3),
    ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
    ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "robustness_reasoning"), page_number=3),
    ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "decision_boundary"), page_number=3),
    ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "optimization_reasoning"), page_number=3),
    ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "expressiveness_reasoning"), page_number=3),
    # Problem 2.
    short_answer("2a_i", "2a", "2", ("a", "i"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_ii", "2a", "2", ("a", "ii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_iii", "2a", "2", ("a", "iii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_iv", "2a", "2", ("a", "iv"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_v", "2a", "2", ("a", "v"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("indexing", "code_tracing"), page_number=4),
    short_answer("2a_vi", "2a", "2", ("a", "vi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("indexing", "error_reasoning"), page_number=5),
    short_answer("2a_vii", "2a", "2", ("a", "vii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("masking", "code_tracing"), page_number=5),
    short_answer("2a_viii", "2a", "2", ("a", "viii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("aggregation", "code_tracing"), page_number=5),
    short_answer("2a_ix", "2a", "2", ("a", "ix"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("transpose", "code_tracing"), page_number=5),
    short_answer("2b_i", "2b", "2", ("b", "i"), 2, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting", "code_tracing"), page_number=6),
    short_answer("2b_ii", "2b", "2", ("b", "ii"), 2, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting", "error_reasoning"), page_number=6),
    short_answer("2b_iii", "2b", "2", ("b", "iii"), 2, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting", "code_tracing"), page_number=6),
    ChildSpec("2c", "2", "2", ("c",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "geometry_reasoning"), page_number=7),
    # Problem 3: not split; the empty path keeps the whole problem text.
    short_answer("3", "3", "3", (), 8, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("concept_explanation", "missing_data_reasoning"), page_number=9),
    # Problem 4.
    ChildSpec("4a", "4", "4", ("a",), 8, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "classification_decision"), page_number=10),
    short_answer("4b", "4", "4", ("b",), 6, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("distance_reasoning", "comparison"), page_number=11),
    # Problem 5.
    ChildSpec("5a", "5", "5", ("a",), 7, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "algorithm_tracing"), page_number=12),
    ChildSpec("5b", "5", "5", ("b",), 7, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("centroid_update", "algorithm_tracing"), page_number=12),
    short_answer("5c", "5", "5", ("c",), 5, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("concept_explanation", "model_selection"), page_number=14),
    # Problem 6.
    short_answer("6a", "6", "6", ("a",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("convergence_reasoning",), page_number=15),
    mc("6b", "6", "6", ("b",), 2, options=ABCDE, correct_option="D", analytics_topic="Perceptron and MLP", skill_tags=("generalization_reasoning",), page_number=15),
    short_answer("6c", "6", "6", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("activation_reasoning",), page_number=16),
    ChildSpec("6d", "6", "6", ("d",), 6, "long_question", "coding", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("debugging", "implementation", "weight_update"), page_number=16),
    # Problem 7.
    short_answer("7a", "7", "7", ("a",), 4, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("decision_boundary", "linearity_reasoning"), page_number=18),
    short_answer("7b", "7", "7", ("b",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("decision_boundary", "linearity_reasoning"), page_number=18),
    ChildSpec("7c", "7", "7", ("c",), 10, "long_question", "long_answer", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("architecture_reasoning", "parameter_design"), page_number=19),
]
# Matches a part marker such as "(a)" or "(iii)" at the start of a line.
# The former "[a-z]+|[ivx]+" alternation was redundant: "[a-z]+" already
# matches every roman-numeral marker, so this pattern accepts the same text.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")

# Roman-numeral markers recognised as second-level parts, in list order.
_ROMAN_MARKERS = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii")


def _marker_depths(markers: list[str]) -> list[int]:
    """Classify each marker (in document order) as depth 0 or depth 1.

    Letter markers ("a", "b", ...) are depth 0.  A run of roman numerals that
    follows a letter marker is depth 1.  "(i)" is ambiguous (letter vs. roman
    one): it is treated as a letter when it continues the alphabet (after
    "h") or is directly followed by "j", unless the next marker is "(ii)".
    Roman numerals with no preceding letter marker stay at depth 0.
    """
    depths: list[int] = []
    last_letter = None  # most recent single-letter depth-0 marker
    roman_pos = 0  # index of the next expected roman numeral; 0 = not in a sub-list
    for idx, marker in enumerate(markers):
        if roman_pos and marker in _ROMAN_MARKERS[roman_pos:]:
            # Continuation of the current sub-list (gaps are tolerated).
            depths.append(1)
            roman_pos = _ROMAN_MARKERS.index(marker) + 1
            continue
        if marker == "i" and last_letter is not None:
            following = markers[idx + 1] if idx + 1 < len(markers) else None
            expected_letter = chr(ord(last_letter) + 1)
            if following == "ii" or (marker != expected_letter and following != "j"):
                depths.append(1)
                roman_pos = 1
                continue
        depths.append(0)
        roman_pos = 0
        if len(marker) == 1:
            last_letter = marker
    return depths


def split_sections(text: str, *, level: int | None = None) -> tuple[str, dict[str, str]]:
    """Split *text* into an intro and the sections opened by part markers.

    Args:
        text: Question or answer text containing line-leading "(x)" markers.
        level: ``None`` (default) splits at every marker, exactly as before.
            ``0`` splits only at top-level markers so each section keeps its
            nested "(i)", "(ii)" parts; ``1`` splits only at nested markers.

    Returns:
        ``(intro, sections)`` where *intro* is the stripped text before the
        first marker of the requested level and *sections* maps each marker
        to its stripped text (marker included).  When no marker of that level
        exists the result is ``(text.strip(), {})``.  If a marker repeats,
        the last occurrence wins.
    """
    matches = list(MARKER_RE.finditer(text))
    if level is None:
        depths = [0] * len(matches)  # flat mode: every marker is a boundary
        level = 0
    else:
        depths = _marker_depths([match.group(1) for match in matches])
    openers = [idx for idx, depth in enumerate(depths) if depth == level]
    if not openers:
        return text.strip(), {}
    intro = text[: matches[openers[0]].start()].strip()
    sections: dict[str, str] = {}
    for idx in openers:
        # A section runs until the next marker at the same or a shallower depth.
        end = next(
            (matches[later].start() for later in range(idx + 1, len(matches)) if depths[later] <= level),
            len(text),
        )
        sections[matches[idx].group(1)] = text[matches[idx].start() : end].strip()
    return intro, sections


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the text of the part addressed by *path*, prefixed by the intro.

    Args:
        text: Full text of a top-level question (or its answer).
        path: Markers to follow, e.g. ``("a",)`` or ``("a", "i")``.  An empty
            path returns the stripped text unchanged.

    Returns:
        The top-level intro (if any) followed by the addressed section.  If a
        marker on the path cannot be found, descent stops and the text reached
        so far is returned; the intro is not added in that case because the
        unsplit text already contains it.
    """
    current = text.strip()
    carried_intro: list[str] = []
    for depth, marker in enumerate(path):
        intro, sections = split_sections(current, level=depth)
        section = sections.get(marker)
        if section is None:
            # Depth classification is heuristic; retry with the flat split so
            # anything the old implementation could find is still found.
            section = split_sections(current)[1].get(marker)
        if section is None:
            break
        if depth == 0 and intro:
            carried_intro.append(intro)
        current = section
    return "\n".join(part for part in [*carried_intro, current] if part).strip()
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map part letters "a".."j" to "T" or "F" as found in *answer_text*.

    Returns an empty dict when any marker is followed by both letters
    ("T F"), since the chosen answer cannot be told apart in that layout.
    """
    answers: dict[str, str] = {}
    # A marker followed by "T F" means both choices are printed; give up
    # rather than let the next pattern record "T" for every part.
    matches = list(re.finditer(r"(?m)^\(([a-j])\)\s*\n?T\s*F", answer_text))
    if matches:
        return answers
    # Usual layout: a line-leading marker followed by a lone T or F.
    for match in re.finditer(r"(?m)^\(([a-j])\)\s*\n?([TF])\b", answer_text):
        answers[match.group(1)] = match.group(2)
    if answers:
        return answers
    # Fallback on stripped, non-blank lines: a line that is only a marker,
    # then a line that is only "T" or "F".  Stripping lets indented markers
    # match, which the line-anchored patterns above cannot.
    lines = [line.strip() for line in answer_text.splitlines() if line.strip()]
    current = None
    for line in lines:
        m = re.fullmatch(r"\(([a-j])\)", line)
        if m:
            current = m.group(1)
            continue
        if current and line in {"T", "F"}:
            answers[current] = line
            # NOTE(review): nesting of this reset was reconstructed from a
            # whitespace-stripped listing -- confirm it belongs inside the if.
            current = None
    return answers
def load_seed_rows(*, path: Path | None = None, exam_key: str | None = None) -> dict[str, dict]:
    """Load this exam's top-level seed rows, keyed by ``question_number``.

    Args:
        path: Seed JSON file to read; defaults to ``PROBLEM_SEED_PATH``.
        exam_key: ``source_exam_key`` to keep; defaults to ``EXAM_KEY``.

    Returns:
        Mapping of ``question_number`` to the seed row.  Rows belonging to
        other exams, or lacking ``source_exam_key``, are skipped.  If a
        question number repeats, the last row wins.
    """
    seed_path = PROBLEM_SEED_PATH if path is None else path
    wanted_key = EXAM_KEY if exam_key is None else exam_key
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding, which would garble or reject non-ASCII exam text.
    data = json.loads(seed_path.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row.get("source_exam_key") == wanted_key
    }
def main() -> None:
    """Replace the paper's ``paper_questions`` rows with the ``CHILDREN`` split.

    Looks up the paper for ``EXAM_KEY``, builds one row per ``ChildSpec`` from
    the seed text, deletes every existing ``paper_questions`` row of that
    paper, inserts the new rows, and sets the paper's ``question_count`` and
    ``status`` ("processing").

    Raises:
        LookupError: If the paper or a required top-level seed row is missing.
            Both checks run before anything is deleted.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        raise LookupError(f"No paper found with source_exam_key {EXAM_KEY!r}.")
    paper_id = papers[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    missing_parents = sorted({child.top_level_number for child in CHILDREN}.difference(parent_rows))
    if missing_parents:
        raise LookupError(f"Seed rows missing for {EXAM_KEY!r}: questions {', '.join(missing_parents)}.")
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")
    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        # Values already stored for this question number take precedence for
        # the tagging columns, so a re-run does not discard them.
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        # An unsplit problem (empty path) keeps its raw answer text verbatim.
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path) if child.path else (parent["raw_answer_text"] or "")
        options = None
        correct_option = child.correct_option
        if child.options:
            options = [{"label": label, "text": text} for label, text in child.options]
        if child.question_type == "true_false":
            options = TRUE_FALSE_OPTIONS
            # NOTE(review): this stores "T"/"F" while the option labels are
            # "True"/"False" -- confirm the consumer maps between them.
            correct_option = tf_answers.get(child.path[0])
        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": child.correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                # NOTE(review): the "" default only applies when no row existed;
                # a stored NULL is forwarded as None. Confirm that is intended.
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )
    # NOTE: delete and insert are separate requests; if the insert fails the
    # paper is left without questions until the script is re-run.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,242 @@
"""Split COMP2211 Spring 2024 final into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
from app.services.supabase_client import get_supabase
# Exam key for the paper this script splits (presumably matched against
# papers.source_exam_key and the seed rows, as in the sibling scripts).
EXAM_KEY = "COMP2211-2024-spring-final"
# Seed file with the top-level problems of every COMP2211 exam, located
# relative to the parent of this script's directory.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)
# Option list for true/false subquestions.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
@dataclass(frozen=True)
class ChildSpec:
    """Static description of one subquestion of this exam."""

    # Subquestion number, e.g. "3a_i".
    question_number: str
    # Number of the enclosing part or problem, e.g. "3a".
    parent_question: str
    # Number of the top-level problem this part belongs to, e.g. "3".
    top_level_number: str
    # Part markers leading to this subquestion, e.g. ("a", "i").
    path: tuple[str, ...]
    score: float
    question_type: str
    question_format: str | None = None
    # Optional tagging metadata.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # (label, text) pairs for questions that offer choices.
    options: tuple[tuple[str, str], ...] | None = None
    correct_option: str | None = None
    correct_answer: str | None = None
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` typed "long_question" with format "short_answer".

    The positional arguments identify the subquestion; the keyword-only
    arguments carry its optional tagging, known answer and page number.
    ``options`` and ``correct_option`` keep their ``ChildSpec`` defaults.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        question_format="short_answer",
        correct_answer=correct_answer,
        page_number=page_number,
        **tagging,
    )
CHILDREN: list[ChildSpec] = [
ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "code_tracing"), page_number=2),
ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("concept_check", "classification_decision"), page_number=2),
ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "activation_reasoning"), page_number=2),
ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "image_processing"), page_number=2),
ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "cnn_complexity"), page_number=2),
ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "regularization"), page_number=2),
ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "pruning_reasoning"), page_number=2),
ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Ethics of AI", "Ethics of AI", ("Ethics of AI",), ("concept_check", "research_ethics"), page_number=2),
ChildSpec("2a", "2", "2", ("a",), 4, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "masking"), page_number=3),
ChildSpec("2b", "2", "2", ("b",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "convolution", "array_manipulation"), page_number=4),
short_answer("3a_i", "3a", "3", ("a", "i"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
short_answer("3a_ii", "3a", "3", ("a", "ii"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
short_answer("3a_iii", "3a", "3", ("a", "iii"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
short_answer("3a_iv", "3a", "3", ("a", "iv"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
short_answer("3b_i", "3b", "3", ("b", "i"), 1.5, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("validation_reasoning",), page_number=6),
short_answer("3b_ii", "3b", "3", ("b", "ii"), 1.5, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("validation_reasoning",), page_number=6),
short_answer("3b_iii", "3b", "3", ("b", "iii"), 1.5, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("validation_reasoning",), page_number=6),
short_answer("3c", "3", "3", ("c",), 1.5, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("linearity_reasoning", "classification_decision"), page_number=6),
short_answer("4a_i", "4a", "4", ("a", "i"), 2.5, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("parameter_counting",), page_number=7),
short_answer("4a_ii", "4a", "4", ("a", "ii"), 2.5, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("model_selection",), page_number=7),
short_answer("4b", "4", "4", ("b",), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation",), page_number=7),
short_answer("4c", "4", "4", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("activation_reasoning", "optimization_reasoning"), page_number=7),
ChildSpec("4d_i", "4d", "4", ("d", "i"), 1.5, "long_question", "long_answer", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("forward_pass", "activation_reasoning"), page_number=8),
ChildSpec("4d_ii", "4d", "4", ("d", "ii"), 1.5, "long_question", "long_answer", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("backpropagation", "weight_update"), page_number=8),
ChildSpec("5a", "5", "5", ("a",), 4.5, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("histogram_reasoning", "image_transform"), page_number=9),
ChildSpec("5b", "5", "5", ("b",), 3, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("thresholding", "manual_computation"), page_number=10),
ChildSpec("5c", "5", "5", ("c",), 2, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("padding", "manual_construction"), page_number=10),
short_answer("5d_i", "5d", "5", ("d", "i"), 0.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("filter_effect_reasoning",), page_number=11),
short_answer("5d_ii", "5d", "5", ("d", "ii"), 0.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("filter_effect_reasoning",), page_number=11),
short_answer("5d_iii", "5d", "5", ("d", "iii"), 0.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("filter_effect_reasoning",), page_number=11),
short_answer("5e", "5", "5", ("e",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("concept_explanation", "local_vs_global"), page_number=11),
ChildSpec("6a", "6", "6", ("a",), 10, "long_question", "coding", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("implementation", "convolution", "debugging"), page_number=12),
ChildSpec("6b", "6", "6", ("b",), 3, "long_question", "coding", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("implementation", "regularization"), page_number=15),
short_answer("7a_i", "7a", "7", ("a", "i"), 1, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("cnn_architecture",), page_number=16),
short_answer("7a_ii", "7a", "7", ("a", "ii"), 4, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("shape_reasoning", "parameter_counting"), page_number=16),
short_answer("7a_iii", "7a", "7", ("a", "iii"), 3, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("overfitting", "regularization"), page_number=16),
ChildSpec("7b", "7", "7", ("b",), 5, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("manual_computation", "cnn_forward_pass"), page_number=17),
short_answer("7c_i", "7c", "7", ("c", "i"), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("shape_reasoning", "3d_convolution"), page_number=17),
short_answer("7c_ii", "7c", "7", ("c", "ii"), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("parameter_counting", "3d_convolution"), page_number=17),
short_answer("7c_iii", "7c", "7", ("c", "iii"), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("parameter_counting", "3d_convolution"), page_number=17),
short_answer("8a_i", "8a", "8", ("a", "i"), 1, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("tree_search", "manual_tracing"), page_number=18),
short_answer("8a_ii", "8a", "8", ("a", "ii"), 3, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning", "manual_tracing"), page_number=18),
short_answer("8a_iii", "8a", "8", ("a", "iii"), 1, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("game_reasoning",), page_number=18),
short_answer("8b_i", "8b", "8", ("b", "i"), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("utility_reasoning",), page_number=18),
short_answer("8b_ii", "8b", "8", ("b", "ii"), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning_reasoning", "concept_explanation"), page_number=18),
short_answer("9", "9", "9", (), 3, analytics_topic="Ethics of AI", topic_primary="Ethics of AI", topic_tags=("Ethics of AI",), skill_tags=("concept_explanation", "governance"), page_number=19),
]
# Matches a part label such as "(a)" or "(iv)" at the start of a line and
# captures the label without its parentheses.  The second alternative is
# already covered by ``[a-z]+``; the pattern is kept exactly as written.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")

# Roman-numeral labels in order; used to recognise later siblings of a
# roman-numbered sub-part.
_ROMAN_MARKERS = (
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
)


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* at every line-leading ``(label)`` marker.

    The split is flat: nesting is ignored, every marker ends the previous
    section, and when a label occurs more than once the last occurrence wins.

    Returns:
        ``(intro, sections)`` where ``intro`` is the stripped text before the
        first marker and ``sections`` maps each label to its stripped section
        (marker included).  Without any marker the result is
        ``(text.strip(), {})``.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    intro = text[: matches[0].start()].strip()
    sections: dict[str, str] = {}
    for idx, match in enumerate(matches):
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        sections[match.group(1)] = text[match.start() : end].strip()
    return intro, sections


def _is_later_sibling(candidate: str, marker: str) -> bool:
    """Return True when *candidate* labels a later part on *marker*'s level."""
    if marker in _ROMAN_MARKERS and candidate in _ROMAN_MARKERS:
        if _ROMAN_MARKERS.index(candidate) > _ROMAN_MARKERS.index(marker):
            return True
    if len(marker) == 1 and len(candidate) == 1 and candidate > marker:
        # "(i)", "(v)" and "(x)" normally open or continue a nested roman
        # list, so they only count as letters when they directly follow
        # *marker* in the alphabet (e.g. "(i)" right after "(h)").
        return candidate not in _ROMAN_MARKERS or ord(candidate) == ord(marker) + 1
    return False


def _slice_section(text: str, marker: str) -> str:
    """Return the section opened by the first ``(marker)`` in *text*.

    The section runs up to the next marker on the same nesting level (or to
    the end of *text*), so nested sub-parts stay inside it.  Returns ``""``
    when *text* has no such marker; a found section is never empty because it
    includes the marker itself.
    """
    matches = list(MARKER_RE.finditer(text))
    for idx, match in enumerate(matches):
        if match.group(1) != marker:
            continue
        end = len(text)
        for later in matches[idx + 1 :]:
            if _is_later_sibling(later.group(1), marker):
                end = later.start()
                break
        return text[match.start() : end].strip()
    return ""


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, prefixed by the intro.

    Args:
        text: Full text of a top-level question (or of its answer).
        path: Labels from the outermost part inwards, e.g. ``("a", "ii")``.
            An empty path selects the whole text.

    Returns:
        The stripped text before the first marker (if any) followed by the
        selected section.  A label that cannot be found leaves the current
        selection unchanged; when no label resolves at all, the stripped
        *text* is returned once.

    Two defects of the previous flat lookup are fixed here: a parent part was
    cut off at its first nested marker, so nested paths never reached the
    sub-part; and unresolved paths returned the text (or its intro) twice.
    """
    if not path:
        return text.strip()
    current = text.strip()
    intro, _ = split_sections(current)
    resolved = False
    for marker in path:
        section = _slice_section(current, marker)
        if section:
            current = section
            resolved = True
    if not resolved:
        # Nothing was narrowed down, so ``current`` still contains the intro.
        return current
    return "\n".join(part for part in (intro, current) if part).strip()
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map the part letters "a".."j" to "T"/"F" read from *answer_text*.

    The answer table is tried first; note that its pattern only matches the
    word "Answer" followed by one specific ten-mark sequence.  Otherwise the
    first ten standalone ``T``/``F`` tokens anywhere in the text are used.
    Returns an empty dict when fewer than ten marks are found.
    """
    letters = "abcdefghij"
    table = re.search(r"Answer\s+(T\s+F\s+T\s+F\s+F\s+T\s+F\s+F\s+F\s+T)", answer_text, re.S)
    if table is not None:
        marks = re.findall(r"[TF]", table.group(1))
        if len(marks) == 10:
            return dict(zip(letters, marks))
    standalone = re.findall(r"\b([TF])\b", answer_text)
    if len(standalone) < 10:
        return {}
    # zip() stops after the ten letters, so surplus tokens are ignored.
    return dict(zip(letters, standalone))
def load_seed_rows() -> dict[str, dict]:
    """Load the seed entries for EXAM_KEY, keyed by ``question_number``.

    The seed file is decoded as UTF-8 explicitly; without an encoding argument
    ``read_text`` uses the locale default, which can fail on (or mis-decode)
    non-ASCII question text depending on the machine.
    """
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {row["question_number"]: row for row in data if row["source_exam_key"] == EXAM_KEY}
def main() -> None:
    """Replace the exam's ``paper_questions`` rows with one row per CHILDREN entry.

    Question and answer text come from the seed rows, sliced per sub-question
    with extract_segment().  Values already stored on an existing row with the
    same ``question_number`` (topics, tags, difficulty, hints, solution,
    ``page_y_ratio``) take precedence over the static spec.

    Raises:
        SystemExit: when no ``papers`` row matches EXAM_KEY; nothing is
            modified in that case.
        KeyError: when the seed lacks a top-level question that CHILDREN
            refers to; this happens before anything is deleted.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        # Previously this surfaced as a bare IndexError from ``.data[0]``.
        raise SystemExit(f"No papers row with source_exam_key={EXAM_KEY!r}; nothing was changed.")
    paper_id = papers[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path) if child.path else (parent["raw_answer_text"] or "")
        options = None
        correct_option = child.correct_option
        if child.question_type == "true_false":
            options = TRUE_FALSE_OPTIONS
            # NOTE(review): tf_answers holds "T"/"F"; confirm that this is the
            # form consumers compare against the labels in TRUE_FALSE_OPTIONS.
            correct_option = tf_answers.get(child.path[0])
        elif child.options:
            options = [{"label": label, "text": text} for label, text in child.options]
        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": child.correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # Every row is built before the delete, so bad seed data fails early.
    # NOTE(review): delete and insert are separate requests; if the insert
    # fails, the previous rows are already gone.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,291 @@
"""Rebuild COMP2211 Spring 2024 midterm into subquestions."""
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from pathlib import Path
import fitz
from app.services.supabase_client import get_supabase
# Exam identifier: selects the `papers` row in main() and filters the seed
# file in load_seed_rows().
EXAM_KEY = "COMP2211-2024-spring-midterm"
# Directory two levels above this script.
ROOT = Path(__file__).resolve().parent.parent
# PDF that build_source_rows() reads question text from.
QUESTION_PDF = ROOT / "pastpaper-scraper" / "papers" / "COMP2211" / "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf"
# PDF that build_source_rows() reads raw answer text from.
ANSWER_PDF = ROOT / "pastpaper-scraper" / "papers" / "COMP2211" / "(COMP2211)[2024](s)midterm~=ubrzkjmz^_90406.pdf"
# JSON seed file parsed by load_seed_rows().
PROBLEM_SEED_PATH = ROOT / "pastpaper-scraper" / "reviews" / "COMP2211" / "problem_seed.json"
# Option list stored on every child whose question_type is "true_false".
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
@dataclass(frozen=True)
class ChildSpec:
    """Static description of one sub-question row that main() inserts.

    CHILDREN builds instances positionally, so the field order below is part
    of the interface.
    """

    # Written to the ``question_number`` column; also the key used to find an
    # existing row for the same sub-question.
    question_number: str
    # Written to the ``parent_question`` column.
    parent_question: str
    # Key of the top-level source row whose text this child is cut from.
    top_level_number: str
    # Part labels handed to extract_segment(); for true/false children the
    # first label also selects the answer.
    path: tuple[str, ...]
    # Written to the ``score`` column.
    score: float
    # Written to ``question_type``; "true_false" attaches TRUE_FALSE_OPTIONS.
    question_type: str
    # Written to the ``question_format`` column.
    question_format: str | None = None
    # Topic/skill defaults, used only when the existing row has no value.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # Written to the ``page_number`` column.
    page_number: int = 1
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ChildSpec with type "long_question" and format "short_answer".

    All other values are passed through unchanged; the topic/skill arguments
    and ``page_number`` are keyword-only.
    """
    # Arguments are given in ChildSpec's field order.
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        "short_answer",
        analytics_topic,
        topic_primary,
        topic_tags,
        skill_tags,
        page_number,
    )
# One entry per sub-question, in the order main() assigns ``display_order``.
# Positional ChildSpec arguments are: question_number, parent_question,
# top_level_number, path, score, question_type, question_format,
# analytics_topic, topic_primary, topic_tags, skill_tags.
CHILDREN: list[ChildSpec] = [
    # Question 1: ten true/false statements.
    ChildSpec("1a", "1", "1", ("a",), 0.5, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "code_tracing"), page_number=3),
    ChildSpec("1b", "1", "1", ("b",), 0.5, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "broadcasting"), page_number=3),
    ChildSpec("1c", "1", "1", ("c",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
    ChildSpec("1d", "1", "1", ("d",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "tie_reasoning"), page_number=3),
    ChildSpec("1e", "1", "1", ("e",), 0.5, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "cross_validation"), page_number=3),
    ChildSpec("1f", "1", "1", ("f",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "clustering_property"), page_number=3),
    ChildSpec("1g", "1", "1", ("g",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "robustness_reasoning"), page_number=3),
    ChildSpec("1h", "1", "1", ("h",), 0.5, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "decision_boundary"), page_number=3),
    ChildSpec("1i", "1", "1", ("i",), 0.5, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "optimization_reasoning"), page_number=3),
    ChildSpec("1j", "1", "1", ("j",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "clustering_property"), page_number=3),
    # Question 2.
    short_answer("2a_i", "2a", "2", ("a", "i"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_ii", "2a", "2", ("a", "ii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_iii", "2a", "2", ("a", "iii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("array_manipulation",), page_number=5),
    short_answer("2a_iv", "2a", "2", ("a", "iv"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("array_construction",), page_number=5),
    short_answer("2a_v", "2a", "2", ("a", "v"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("aggregation",), page_number=5),
    short_answer("2a_vi", "2a", "2", ("a", "vi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("transpose",), page_number=6),
    short_answer("2a_vii", "2a", "2", ("a", "vii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("matrix_multiplication",), page_number=6),
    short_answer("2a_viii", "2a", "2", ("a", "viii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("dot_product",), page_number=6),
    short_answer("2a_ix", "2a", "2", ("a", "ix"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting",), page_number=6),
    short_answer("2a_x", "2a", "2", ("a", "x"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("error_reasoning",), page_number=7),
    short_answer("2a_xi", "2a", "2", ("a", "xi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting",), page_number=7),
    short_answer("2a_xii", "2a", "2", ("a", "xii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("slicing",), page_number=7),
    short_answer("2a_xiii", "2a", "2", ("a", "xiii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("views_vs_copies",), page_number=7),
    ChildSpec("2b", "2", "2", ("b",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "similarity_computation"), page_number=8),
    # Question 3.
    ChildSpec("3a", "3", "3", ("a",), 5.5, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("manual_computation", "metric_reasoning"), page_number=10),
    short_answer("3b", "3", "3", ("b",), 1, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("metric_reasoning",), page_number=11),
    ChildSpec("3c", "3", "3", ("c",), 2.5, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("manual_computation", "metric_reasoning"), page_number=11),
    short_answer("3d", "3", "3", ("d",), 1, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("metric_reasoning",), page_number=12),
    ChildSpec("3e", "3", "3", ("e",), 6, "long_question", "coding", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("implementation", "metrics", "vectorization"), page_number=12),
    # Question 4.
    ChildSpec("4a", "4", "4", ("a",), 4, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "gaussian_nb"), page_number=15),
    ChildSpec("4b", "4", "4", ("b",), 3, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "likelihood_reasoning"), page_number=15),
    ChildSpec("4c", "4", "4", ("c",), 4, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("laplace_smoothing", "likelihood_reasoning"), page_number=16),
    short_answer("4d", "4", "4", ("d",), 2, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("prior_reasoning",), page_number=17),
    ChildSpec("4e", "4", "4", ("e",), 3, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("posterior_reasoning", "classification_decision"), page_number=17),
    # Question 5.
    ChildSpec("5a", "5", "5", ("a",), 3, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "weighted_knn"), page_number=18),
    ChildSpec("5b", "5", "5", ("b",), 13, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("cross_validation", "manual_tracing", "model_selection"), page_number=18),
    short_answer("5c", "5", "5", ("c",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("test_error", "model_selection"), page_number=20),
    # Question 6.
    ChildSpec("6a", "6", "6", ("a",), 6, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("manual_computation", "clustering"), page_number=21),
    ChildSpec("6b", "6", "6", ("b",), 6, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("manual_computation", "clustering"), page_number=22),
    short_answer("6c", "6", "6", ("c",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("outlier_reasoning",), page_number=22),
    short_answer("6d", "6", "6", ("d",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("model_selection", "threshold_reasoning"), page_number=22),
    # Question 7: no sub-parts, so the empty path selects the whole text.
    ChildSpec("7", "7", "7", (), 10, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("cross_validation", "data_leakage_reasoning"), page_number=23),
]
# Matches a part label such as "(a)" or "(iv)" at the start of a line and
# captures the label without its parentheses.  The second alternative is
# already covered by ``[a-z]+``; the pattern is kept exactly as written.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")

# Roman-numeral labels in order; used to recognise later siblings of a
# roman-numbered sub-part.
_ROMAN_MARKERS = (
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x",
    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
)


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* at every line-leading ``(label)`` marker.

    The split is flat: nesting is ignored, every marker ends the previous
    section, and when a label occurs more than once the last occurrence wins.

    Returns:
        ``(intro, sections)`` where ``intro`` is the stripped text before the
        first marker and ``sections`` maps each label to its stripped section
        (marker included).  Without any marker the result is
        ``(text.strip(), {})``.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    intro = text[: matches[0].start()].strip()
    sections: dict[str, str] = {}
    for idx, match in enumerate(matches):
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        sections[match.group(1)] = text[match.start() : end].strip()
    return intro, sections


def _is_later_sibling(candidate: str, marker: str) -> bool:
    """Return True when *candidate* labels a later part on *marker*'s level."""
    if marker in _ROMAN_MARKERS and candidate in _ROMAN_MARKERS:
        if _ROMAN_MARKERS.index(candidate) > _ROMAN_MARKERS.index(marker):
            return True
    if len(marker) == 1 and len(candidate) == 1 and candidate > marker:
        # "(i)", "(v)" and "(x)" normally open or continue a nested roman
        # list, so they only count as letters when they directly follow
        # *marker* in the alphabet (e.g. "(i)" right after "(h)").
        return candidate not in _ROMAN_MARKERS or ord(candidate) == ord(marker) + 1
    return False


def _slice_section(text: str, marker: str) -> str:
    """Return the section opened by the first ``(marker)`` in *text*.

    The section runs up to the next marker on the same nesting level (or to
    the end of *text*), so nested sub-parts stay inside it.  Returns ``""``
    when *text* has no such marker; a found section is never empty because it
    includes the marker itself.
    """
    matches = list(MARKER_RE.finditer(text))
    for idx, match in enumerate(matches):
        if match.group(1) != marker:
            continue
        end = len(text)
        for later in matches[idx + 1 :]:
            if _is_later_sibling(later.group(1), marker):
                end = later.start()
                break
        return text[match.start() : end].strip()
    return ""


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, prefixed by the intro.

    Args:
        text: Full text of a top-level question (or of its answer).
        path: Labels from the outermost part inwards, e.g. ``("a", "ii")``.
            An empty path selects the whole text.

    Returns:
        The stripped text before the first marker (if any) followed by the
        selected section.  A label that cannot be found leaves the current
        selection unchanged; when no label resolves at all, the stripped
        *text* is returned once.

    Two defects of the previous flat lookup are fixed here: a parent part was
    cut off at its first nested marker, so nested paths never reached the
    sub-part; and unresolved paths returned the text (or its intro) twice.
    """
    if not path:
        return text.strip()
    current = text.strip()
    intro, _ = split_sections(current)
    resolved = False
    for marker in path:
        section = _slice_section(current, marker)
        if section:
            current = section
            resolved = True
    if not resolved:
        # Nothing was narrowed down, so ``current`` still contains the intro.
        return current
    return "\n".join(part for part in (intro, current) if part).strip()
def extract_pages(pdf_path: Path, start: int, end: int) -> str:
    """Return the text of pages *start* to *end* of the PDF, joined by newlines.

    Page numbers are 1-based and *end* is inclusive.  The document is closed
    even if reading a page fails.
    """
    document = fitz.open(pdf_path)
    try:
        page_texts = [document[index].get_text("text") for index in range(start - 1, end)]
    finally:
        document.close()
    return "\n".join(page_texts)
def load_seed_rows() -> dict[str, dict]:
    """Load the seed entries for EXAM_KEY, keyed by ``question_number``.

    The seed file is decoded as UTF-8 explicitly; without an encoding argument
    ``read_text`` uses the locale default, which can fail on (or mis-decode)
    non-ASCII question text depending on the machine.
    """
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {row["question_number"]: row for row in data if row["source_exam_key"] == EXAM_KEY}
def build_source_rows(existing_rows: dict[str, dict]) -> dict[str, dict]:
    """Return the seed rows with questions 5 and 7 re-read from the PDFs.

    For both questions the question text, raw answer text and page number are
    replaced by text extracted from fixed page ranges of QUESTION_PDF and
    ANSWER_PDF.  When the seed already has the question, its topic, skill and
    difficulty fields are overwritten as well.

    The previous fallback branch for a question missing from the seed read
    that same missing seed entry and therefore always raised ``KeyError``;
    such a row is now built from the PDF text alone.

    Args:
        existing_rows: Currently unused; kept so existing callers keep working.

    Returns:
        Mapping of top-level question number to its source row.
    """
    rows = dict(load_seed_rows())
    # (question number, question pages, answer pages, metadata overrides);
    # page ranges are 1-based and inclusive.
    overrides = (
        (
            "5",
            (18, 20),
            (21, 25),
            {
                "analytics_topic": "KNN and Clustering",
                "topic_primary": "KNN and Clustering",
                "topic_tags": ["KNN and Clustering"],
                "skill_tags": ["manual_computation", "distance_calculation", "algorithm_tracing"],
                "difficulty": "medium",
            },
        ),
        (
            "7",
            (23, 24),
            (31, 34),
            {
                "analytics_topic": "Evaluation and Validation",
                "topic_primary": "Evaluation and Validation",
                "topic_tags": ["Evaluation and Validation"],
                "skill_tags": ["cross_validation", "data_leakage_reasoning"],
                "difficulty": "medium",
            },
        ),
    )
    for number, question_pages, answer_pages, metadata in overrides:
        seed_row = rows.get(number)
        rebuilt = {
            **(seed_row or {}),
            "question_text": extract_pages(QUESTION_PDF, *question_pages),
            "raw_answer_text": extract_pages(ANSWER_PDF, *answer_pages),
            # The question starts on the first page of its range.
            "page_number": question_pages[0],
        }
        if seed_row is not None:
            # As before, metadata is only stamped onto rows the seed provides.
            rebuilt.update(metadata)
        rows[number] = rebuilt
    return rows
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map part letters to "T"/"F" read from *answer_text*.

    Three layouts are tried in order; the first one that yields answers wins:

    1. The word "Answer" followed by a run of T/F marks (first ten are used).
    2. A line holding only "(a)".."(j)", answered by a later line holding only
       "T" or "F"; this may return fewer than ten entries.
    3. The first ten standalone T/F tokens anywhere in the text.

    Returns an empty dict when none of the layouts matches.
    """
    letters = "abcdefghij"

    table = re.search(r"Answer\s+([TF\s]+)", answer_text, re.S)
    if table is not None:
        marks = re.findall(r"[TF]", table.group(1))
        if len(marks) >= 10:
            # zip() stops after the ten letters, so surplus marks are ignored.
            return dict(zip(letters, marks))

    answers: dict[str, str] = {}
    pending = None
    for raw_line in answer_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        label = re.fullmatch(r"\(([a-j])\)", line)
        if label is not None:
            pending = label.group(1)
        elif pending and line in ("T", "F"):
            # Other lines between a label and its mark are skipped over.
            answers[pending] = line
            pending = None
    if answers:
        return answers

    standalone = re.findall(r"\b([TF])\b", answer_text)
    if len(standalone) >= 10:
        return dict(zip(letters, standalone))
    return answers
def main() -> None:
    """Replace the exam's ``paper_questions`` rows with one row per CHILDREN entry.

    Question and answer text come from build_source_rows(), sliced per
    sub-question with extract_segment().  Values already stored on an existing
    row with the same ``question_number`` (topics, tags, difficulty, hints,
    solution, ``page_y_ratio``) take precedence over the static spec.

    Raises:
        SystemExit: when no ``papers`` row matches EXAM_KEY; nothing is
            modified in that case.
        KeyError: when a top-level question that CHILDREN refers to has no
            source row; this happens before anything is deleted.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        # Previously this surfaced as a bare IndexError from ``.data[0]``.
        raise SystemExit(f"No papers row with source_exam_key={EXAM_KEY!r}; nothing was changed.")
    paper_id = papers[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = build_source_rows(existing_by_number)
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path) if child.path else (parent["raw_answer_text"] or "")
        options = None
        correct_option = None
        if child.question_type == "true_false":
            options = TRUE_FALSE_OPTIONS
            # NOTE(review): tf_answers holds "T"/"F" while the option labels
            # are "True"/"False"; confirm consumers expect the short form.
            correct_option = tf_answers.get(child.path[0])
        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": None,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # Every row is built before the delete, so bad source data fails early.
    # NOTE(review): delete and insert are separate requests; if the insert
    # fails, the previous rows are already gone.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,121 @@
"""Upload COMP2211 course-library PDFs to Supabase Storage.
Run from the backend directory:
uv run python upload_course_library_pdfs.py
Each entry maps a storage path (inside the `papers` bucket) to the local
source file under pastpaper-scraper/papers/COMP2211/.
"""
from __future__ import annotations
import sys
from pathlib import Path
# ---------------------------------------------------------------------------
# Manifest: (storage_path, local_filename)
# storage_path is relative inside the `papers` bucket.
# local_filename is relative to PAPERS_DIR below.
# ---------------------------------------------------------------------------
# Each exam contributes a paper.pdf and an answer.pdf entry.
MANIFEST: list[tuple[str, str]] = [
    (
        "course-library/COMP2211/COMP2211-2022-fall-midterm/paper.pdf",
        "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-fall-midterm/answer.pdf",
        "(COMP2211)[2022](f)midterm~=yjz8dxdd^_18747.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-spring-midterm/paper.pdf",
        "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-spring-midterm/answer.pdf",
        "(COMP2211)[2022](s)midterm~=6ma030^_89587.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-spring-final-part-a/paper.pdf",
        "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-spring-final-part-a/answer.pdf",
        "(COMP2211)[2022](s)final~=ajou6^_82011.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-spring-final-part-b/paper.pdf",
        "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2022-spring-final-part-b/answer.pdf",
        "(COMP2211)[2022](s)final~=ajou6^_51199.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2023-spring-midterm/paper.pdf",
        "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2023-spring-midterm/answer.pdf",
        # NOTE(review): the only name here without "=" after "~"; confirm it
        # matches the file on disk, otherwise main() reports it as MISSING.
        "(COMP2211)[2023](s)midterm~clchanbg^_17297.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2024-spring-midterm/paper.pdf",
        "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2024-spring-midterm/answer.pdf",
        "(COMP2211)[2024](s)midterm~=ubrzkjmz^_90406.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2024-spring-final/paper.pdf",
        "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
    ),
    (
        "course-library/COMP2211/COMP2211-2024-spring-final/answer.pdf",
        "(COMP2211)[2024](s)final~=igk5mmg^_58857.pdf",
    ),
]
# Directory holding the local source PDFs named in MANIFEST: two levels above
# this script, then pastpaper-scraper/papers/COMP2211.
PAPERS_DIR = (
    Path(__file__).parent.parent
    / "pastpaper-scraper"
    / "papers"
    / "COMP2211"
)
def main() -> None:
    """Upload every MANIFEST entry to the ``papers`` bucket and print a summary.

    The run is best-effort: a missing or unreadable local file, or a failed
    upload, is reported and counted, and the remaining entries are still
    processed.

    Raises:
        SystemExit: with status 1 after the summary when at least one entry
            failed, so callers and shell pipelines can detect the failure.
    """
    from app.services.supabase_client import get_supabase

    sb = get_supabase()
    bucket = sb.storage.from_("papers")
    ok = 0
    # No code path skips an entry (uploads are sent with upsert); the counter
    # stays so the summary line keeps its existing format.
    skipped = 0
    failed = 0
    for storage_path, local_name in MANIFEST:
        local_file = PAPERS_DIR / local_name
        if not local_file.exists():
            print(f" MISSING local file: {local_name}")
            failed += 1
            continue
        try:
            # Reading inside the try keeps one unreadable file from aborting
            # the whole batch.
            bucket.upload(
                storage_path,
                local_file.read_bytes(),
                file_options={"content-type": "application/pdf", "upsert": "true"},
            )
        except Exception as exc:  # per-entry boundary: report and continue
            print(f" ERR {storage_path}: {exc}")
            failed += 1
        else:
            print(f" OK {storage_path}")
            ok += 1
    print(f"\nDone: {ok} uploaded, {skipped} skipped, {failed} failed.")
    if failed:
        raise SystemExit(1)


if __name__ == "__main__":
    main()

1969
backend/uv.lock generated Normal file

File diff suppressed because it is too large Load Diff