Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
16
backend/Dockerfile
Normal file
16
backend/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
# Backend image: installs the project from pyproject.toml and serves app.main:app with uvicorn.
FROM python:3.12-slim

WORKDIR /app

# System deps for PyMuPDF
# (the apt package lists are deleted in the same layer so they do not bloat the image)
RUN apt-get update && apt-get install -y --no-install-recommends \
    libmupdf-dev gcc g++ && \
    rm -rf /var/lib/apt/lists/*

# Dependencies are installed before the source is copied, so edits under app/
# do not invalidate the pip layer.
COPY pyproject.toml .
RUN pip install --no-cache-dir .

COPY app/ app/

# uvicorn listens on all interfaces on the exposed port.
EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
4
backend/add_progress_columns.sql
Normal file
4
backend/add_progress_columns.sql
Normal file
@@ -0,0 +1,4 @@
|
||||
-- Add the progress-tracking columns to papers.
-- IF NOT EXISTS lets the statement be re-run without failing on columns that are already present.
ALTER TABLE papers
    ADD COLUMN IF NOT EXISTS processing_step text DEFAULT NULL,
    ADD COLUMN IF NOT EXISTS processing_progress integer DEFAULT 0,
    ADD COLUMN IF NOT EXISTS processing_total integer DEFAULT 0;
|
||||
0
backend/app/__init__.py
Normal file
0
backend/app/__init__.py
Normal file
36
backend/app/config.py
Normal file
36
backend/app/config.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
from functools import lru_cache
|
||||
import os
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration loaded from the environment and a ``.env`` file.

    The three Supabase values have no default and must be supplied. Every
    LLM provider key defaults to an empty string. Unknown variables in the
    environment file are ignored (``extra="ignore"``).
    """

    # Supabase
    supabase_url: str
    supabase_anon_key: str
    supabase_service_role_key: str

    # LLM - laozhang (gpt-4o, gpt-4o-mini)
    laozhang_base_url: str = "https://api.laozhang.ai/v1"
    laozhang_api_key: str = ""

    # LLM - DashScope (qwen-plus)
    dashscope_base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    dashscope_api_key: str = ""

    # LLM - DeepSeek
    deepseek_base_url: str = "https://api.deepseek.com/v1"
    deepseek_api_key: str = ""

    # Google Gemini (official)
    google_gemini_api_key: str = ""

    # The .env file is looked up two directories above this module.
    model_config = {
        "env_file": os.path.join(os.path.dirname(__file__), "../../.env"),
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }
|
||||
|
||||
|
||||
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` object; it is built on the first call and cached afterwards."""
    settings = Settings()
    return settings
|
||||
0
backend/app/dependencies/__init__.py
Normal file
0
backend/app/dependencies/__init__.py
Normal file
34
backend/app/dependencies/auth.py
Normal file
34
backend/app/dependencies/auth.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Auth dependency: validate Supabase JWT and return user_id"""
|
||||
|
||||
from fastapi import Depends, HTTPException, status
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
bearer_scheme = HTTPBearer(auto_error=False)
|
||||
|
||||
|
||||
async def get_current_user_id(
    credentials: HTTPAuthorizationCredentials | None = Depends(bearer_scheme),
) -> str:
    """Validate the request's Bearer token with Supabase and return the user's id.

    Args:
        credentials: Parsed ``Authorization`` header; ``None`` when the header
            is absent (``bearer_scheme`` is created with ``auto_error=False``).

    Returns:
        The ``id`` of the user Supabase resolves the token to.

    Raises:
        HTTPException: 401 ``"Not authenticated"`` when no credentials are sent,
            401 ``"Invalid or expired token"`` when the Supabase lookup fails,
            401 ``"Invalid token"`` when the lookup succeeds but yields no user.
    """
    if not credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Not authenticated",
        )

    sb = get_supabase()
    # Only the Supabase call sits inside the try: the previous version wrapped
    # the whole body, so its own "Invalid token" HTTPException was caught by
    # `except Exception` and replaced with the generic message.
    try:
        result = sb.auth.get_user(credentials.credentials)
        user = result.user
    except Exception as exc:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or expired token",
        ) from exc

    if not user:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid token",
        )
    return user.id
|
||||
59
backend/app/main.py
Normal file
59
backend/app/main.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import asyncio
|
||||
import threading
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from app.routers import analytics, papers, attempts, questions
|
||||
|
||||
|
||||
def _resume_stale_papers():
    """At startup, look for papers stuck in ``processing`` and resume the AI trio for them.

    Each stale paper is handed to ``process_paper`` on its own daemon thread,
    with empty paper bytes and no answer bytes. Any exception raised while
    looking up or launching the work is printed and swallowed, so startup
    continues.
    """
    try:
        # Imported lazily so an import failure is reported as "Resume skipped".
        from app.services.supabase_client import get_supabase
        from app.services.paper_processor import process_paper

        client = get_supabase()
        query = client.table("papers").select("id").eq("status", "processing")
        stale_rows = query.execute().data

        for row in stale_rows or []:
            paper_id = row["id"]
            print(f"[STARTUP] Resuming processing for paper {paper_id[:8]}...")

            # The default argument pins the current id; a plain closure would
            # see whichever id the loop variable holds when the thread runs.
            def run(pid=paper_id):
                asyncio.run(process_paper(pid, b"", None))

            worker = threading.Thread(target=run, daemon=True)
            worker.start()
    except Exception as e:
        print(f"[STARTUP] Resume skipped: {e}")
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: resume stale papers on startup; nothing runs on shutdown."""
    _resume_stale_papers()  # startup work
    yield
    # No shutdown work.
|
||||
|
||||
|
||||
app = FastAPI(title="PastPaper Master API", version="0.1.0", lifespan=lifespan)

# Left wide open during development; tighten before going live.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount each feature router under its /api prefix, in the same order as before.
for _router, _prefix, _tag in (
    (papers.router, "/api/papers", "papers"),
    (attempts.router, "/api/attempts", "attempts"),
    (questions.router, "/api/questions", "questions"),
    (analytics.router, "/api/analytics", "analytics"),
):
    app.include_router(_router, prefix=_prefix, tags=[_tag])
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Liveness probe; always answers ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload
|
||||
0
backend/app/routers/__init__.py
Normal file
0
backend/app/routers/__init__.py
Normal file
285
backend/app/routers/analytics.py
Normal file
285
backend/app/routers/analytics.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""Course-level analytics endpoints."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# Numeric weight of each stored difficulty value; the weights are averaged per course.
DIFFICULTY_SCORE = {"easy": 1, "medium": 2, "hard": 3}
# Display label for a rounded average weight.
DIFFICULTY_LABEL = {1: "Easy", 2: "Medium", 3: "Hard"}
|
||||
|
||||
# ── Topic normalization ──────────────────────────────────────
|
||||
# Map variant spellings to canonical label
|
||||
_TOPIC_ALIASES: dict[str, str] = {
    "numpy": "NumPy",
    "naïve bayes": "Naive Bayes",
    "naïve bayes classifier": "Naive Bayes",
    "naive bayes classifier": "Naive Bayes",
    "bayes classifier": "Naive Bayes",
    "bayes model": "Naive Bayes",
    "bayes' theorem": "Naive Bayes",
    "bayes' rule": "Naive Bayes",
    "k-nearest neighbors": "K-Nearest Neighbors (KNN)",
    "knn": "K-Nearest Neighbors (KNN)",
    "k-means clustering": "K-Means Clustering",
    "k-means": "K-Means Clustering",
    "k means": "K-Means Clustering",
    "multilayer perceptron": "Multilayer Perceptron (MLP)",
    "multi-layer perceptron": "Multilayer Perceptron (MLP)",
    "multi-layer perceptron (mlp)": "Multilayer Perceptron (MLP)",
    "mlp": "Multilayer Perceptron (MLP)",
    "single layer perceptron": "Perceptron",
    "convolutional neural network": "CNN",
    "convolutional neural network (cnn)": "CNN",
    "convolutional neural networks": "CNN",
    "cnn architecture": "CNN",
    "cnn properties": "CNN",
    "python fundamentals": "Python",
    "python programming": "Python",
    "python implementation": "Python",
    "advanced python programming": "Python",
    "python programming: convolutional neural network": "CNN",
    "cross-validation": "Cross Validation",
    "model evaluation implementation": "Model Evaluation",
    "digital image processing": "Image Processing",
    "computer vision": "Image Processing",
    "array slicing": "Array Slicing",
    "slicing": "Array Slicing",
    "array indexing": "Array Slicing",
    "array reshaping": "Reshape",
    "array views": "Array Slicing",
    "view vs copy": "Array Slicing",
    "boolean indexing": "Array Slicing",
    "arange": "NumPy",
    "newaxis": "NumPy",
    "expand dims": "NumPy",
    "transpose": "NumPy",
    "type casting": "NumPy",
    "element-wise operation": "NumPy",
    "array reduction": "NumPy",
    "multi-dimensional array": "NumPy",
    "dot product": "NumPy",
    "vectorization": "NumPy",
    "activation functions": "Activation Function",
    "linear activation function": "Activation Function",
    "neural network architecture": "Neural Networks",
    "hidden layer": "Neural Networks",
    "deep learning": "Neural Networks",
    "deep learning frameworks": "Neural Networks",
    "alpha-beta pruning": "Alpha-Beta Pruning",
    "minimax algorithm": "Minimax",
    "ethics of ai": "AI Ethics",
    "ethics": "AI Ethics",
    "cosine distance": "Cosine Similarity",
    "distance calculation": "Distance Metrics",
    "euclidean distance": "Distance Metrics",
    "manhattan distance": "Distance Metrics",
    "hamming distance": "Distance Metrics",
    "precision": "Model Evaluation",
    "recall": "Model Evaluation",
    "f1 score": "Model Evaluation",
    "macro f1 score": "Model Evaluation",
    "accuracy": "Model Evaluation",
    "classification accuracy": "Model Evaluation",
    "confusion matrix": "Model Evaluation",
    "convolution operation": "Convolution",
    "dilated convolution": "Convolution",
    "3d convolution": "Convolution",
    "gaussian likelihood": "Probability",
    "gaussian distribution": "Probability",
    "categorical likelihood": "Probability",
    "conditional probability": "Probability",
    "total probability theorem": "Probability",
    "probability assumptions": "Probability",
    "tensorflow": "Keras",
    "model summary": "Keras",
    "model construction": "Keras",
    "trainable parameters": "Parameter Calculation",
    "parameter reduction": "Parameter Calculation",
    "output shape calculation": "Parameter Calculation",
    "shape calculation": "Parameter Calculation",
}

# Lower-cased canonical label -> canonical label, so a case variant of a
# canonical name ("cnn", "naive bayes") lands in the same bucket as the
# canonical spelling instead of forming a bucket of its own.
_CANONICAL_TOPICS: dict[str, str] = {
    canonical.lower(): canonical for canonical in _TOPIC_ALIASES.values()
}


def normalize_topic(label: str) -> str:
    """Map a raw topic label to its canonical display label.

    Lookup ignores case and surrounding whitespace. Explicit aliases are
    tried first, then case variants of the canonical labels themselves.

    Args:
        label: Raw topic label as stored on a question.

    Returns:
        The canonical label when one is known; otherwise ``label`` with
        surrounding whitespace removed, so ``" Foo "`` and ``"Foo"`` are not
        counted as different topics.
    """
    stripped = label.strip()
    key = stripped.lower()
    alias = _TOPIC_ALIASES.get(key)
    if alias is not None:
        return alias
    return _CANONICAL_TOPICS.get(key, stripped)
|
||||
|
||||
|
||||
def extract_topic_labels(question: dict) -> list[str]:
    """Return the question's topic labels, normalized and de-duplicated in order.

    ``analytics_topic`` comes first, followed by ``topic_tags``. The legacy
    ``topics`` field is used only when both of those yield nothing. Empty
    values are skipped.
    """
    preferred = [question.get("analytics_topic"), *(question.get("topic_tags") or [])]
    candidates = [value for value in preferred if value]
    if not candidates:
        candidates = [value for value in (question.get("topics") or []) if value]

    # dict.fromkeys keeps the first occurrence of each normalized label.
    return list(dict.fromkeys(normalize_topic(value) for value in candidates))
|
||||
|
||||
|
||||
def extract_question_family(question: dict) -> str:
    """Return ``question_format`` if set, else ``question_type``, else ``"unknown"``."""
    for field in ("question_format", "question_type"):
        value = question.get(field)
        if value:
            return value
    return "unknown"
|
||||
|
||||
|
||||
@router.get("/courses")
async def list_courses():
    """Return the sorted, distinct course codes that have at least one ``ready`` paper."""
    query = get_supabase().table("papers").select("course_code").eq("status", "ready")

    distinct_codes = set()
    for row in query.execute().data:
        code = row.get("course_code")
        if code:
            distinct_codes.add(code)
    return sorted(distinct_codes)
|
||||
|
||||
|
||||
def _paper_source_label(paper: dict) -> str:
    """Format a paper's year, term and exam type (plus optional part) for display."""
    year = paper.get("year")
    # `or ""` guards against columns that are present but NULL, where
    # .title() would raise; a NULL year is rendered as empty, not "None".
    label = (
        f"{'' if year is None else year} {(paper.get('term') or '').title()} "
        f"{(paper.get('exam_type') or '').title()}"
    ).strip()
    part = paper.get("part_label")
    return f"{label} Part {part}" if part else label


def _share_pct(count: int, total: int) -> int:
    """Return ``count`` as a rounded percentage of ``total`` (0 when ``total`` is 0)."""
    return round((count / total) * 100) if total else 0


@router.get("/course/{course_code}")
async def get_course_analytics(course_code: str):
    """Aggregate topic, question-type and difficulty statistics for one course.

    Only papers with status ``ready`` are considered; the course code is
    matched in upper case.

    Returns:
        A dict with ``course_code``, ``kpi``, ``topic_frequency``,
        ``question_types``, ``all_questions``, ``difficulty_distribution``
        and ``high_yield_topics``. The same keys are present, with empty or
        zero values, when the course has no ready papers.
    """
    sb = get_supabase()
    code = course_code.upper()

    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label, status")
        .eq("course_code", code)
        .eq("status", "ready")
        .order("year", desc=True)
        .execute()
        .data
    )
    if not papers:
        return {
            "course_code": code,
            "kpi": {"papers": 0, "questions": 0, "topics": 0, "difficulty": "N/A"},
            "topic_frequency": [],
            "question_types": [],
            # Previously missing here, so the response shape depended on the data.
            "all_questions": [],
            "difficulty_distribution": {"easy": 0, "medium": 0, "hard": 0},
            "high_yield_topics": [],
        }

    papers_by_id = {paper["id"]: paper for paper in papers}
    questions = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, difficulty"
        )
        .in_("paper_id", list(papers_by_id))
        .order("display_order")
        .execute()
        .data
    )

    total_questions = len(questions)
    topic_counter: Counter[str] = Counter()
    type_counter: Counter[str] = Counter()
    difficulty_counter: Counter[str] = Counter()
    topic_examples: dict[str, list[dict]] = defaultdict(list)
    difficulty_scores: list[int] = []
    all_question_items: list[dict] = []

    for question in questions:
        question_type = extract_question_family(question)
        type_counter[question_type] += 1

        # Difficulty values outside the known set are left out of the
        # distribution and the average.
        difficulty = question.get("difficulty")
        if difficulty in DIFFICULTY_SCORE:
            difficulty_counter[difficulty] += 1
            difficulty_scores.append(DIFFICULTY_SCORE[difficulty])

        paper = papers_by_id.get(question["paper_id"], {})
        topics = extract_topic_labels(question)
        q_item = {
            "paper_id": paper.get("id"),
            "source": _paper_source_label(paper),
            "question_number": question.get("question_number"),
            # question_text may be NULL; slicing None would raise.
            "preview": (question.get("question_text") or "")[:220],
            "difficulty": difficulty,
            "question_type": question_type,
            "year": paper.get("year"),
            "term": paper.get("term"),
            "exam_type": paper.get("exam_type"),
            "topics": topics,
        }
        all_question_items.append(q_item)

        for topic in topics:
            topic_counter[topic] += 1
            topic_examples[topic].append(q_item)

    avg_difficulty = "N/A"
    if difficulty_scores:
        rounded = round(sum(difficulty_scores) / len(difficulty_scores))
        avg_difficulty = DIFFICULTY_LABEL.get(rounded, "Medium")

    ranked_topics = topic_counter.most_common()
    topic_frequency = [
        {
            "label": topic,
            "count": count,
            "pct": _share_pct(count, total_questions),
            "questions": topic_examples[topic],
        }
        for topic, count in ranked_topics
    ]
    question_types = [
        {"label": label, "count": count, "pct": _share_pct(count, total_questions)}
        for label, count in type_counter.most_common()
    ]

    return {
        "course_code": code,
        "kpi": {
            "papers": len(papers),
            "questions": total_questions,
            "topics": len(topic_counter),
            "difficulty": avg_difficulty,
        },
        "topic_frequency": topic_frequency,
        "question_types": question_types,
        "all_questions": all_question_items,
        "difficulty_distribution": {
            "easy": difficulty_counter.get("easy", 0),
            "medium": difficulty_counter.get("medium", 0),
            "hard": difficulty_counter.get("hard", 0),
        },
        "high_yield_topics": [topic for topic, _ in ranked_topics[:5]],
    }
|
||||
208
backend/app/routers/attempts.py
Normal file
208
backend/app/routers/attempts.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""用户答题记录 + 拍照批改 + 错题本"""
|
||||
|
||||
import asyncio
|
||||
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.grader import ocr_photo, grade_answer
|
||||
from app.dependencies.auth import get_current_user_id
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class AttemptCreate(BaseModel):
    """Request body for recording one answer attempt."""

    # Id of the question that was answered.
    question_id: str
    # One of "select" | "input" | "photo"
    attempt_type: str
    # The submitted answer; optional.
    user_answer: str | None = None
    # Client-reported correctness; None when not judged.
    is_correct: bool | None = None
|
||||
|
||||
|
||||
class AttemptUpdate(BaseModel):
    """Partial update for an attempt's error-book flags; ``None`` leaves a flag unchanged."""

    # Whether the attempt is listed in the error book.
    in_error_book: bool | None = None
    # Whether the user has marked the question as mastered.
    mastered: bool | None = None
|
||||
|
||||
|
||||
@router.post("/")
async def create_attempt(data: AttemptCreate, user_id: str = Depends(get_current_user_id)):
    """Record one answer attempt for the authenticated user and return the stored row.

    An attempt explicitly reported as incorrect (``is_correct is False``) is
    added to the error book on insert.
    """
    row = dict(
        user_id=user_id,
        question_id=data.question_id,
        attempt_type=data.attempt_type,
        user_answer=data.user_answer,
        is_correct=data.is_correct,
    )
    answered_wrong = data.is_correct is False
    if answered_wrong:
        row["in_error_book"] = True

    inserted = get_supabase().table("user_attempts").insert(row).execute()
    return inserted.data[0]
|
||||
|
||||
|
||||
@router.post("/photo")
async def photo_attempt(
    question_id: str = Form(...),
    photo: UploadFile = File(...),
    user_id: str = Depends(get_current_user_id),
):
    """Photo upload -> OCR -> AI grading, stored as a ``photo`` attempt.

    Args:
        question_id: Id of the question being answered.
        photo: Image of the handwritten answer.
        user_id: Authenticated user, injected by ``get_current_user_id``.

    Returns:
        ``{"attempt": <stored row>, "ocr_text": <str>, "grade": <grader output>}``.

    Raises:
        HTTPException: 400 when the uploaded photo is empty; 404 when the
            question does not exist.
    """
    sb = get_supabase()

    # 1. Read photo
    photo_bytes = await photo.read()
    if not photo_bytes:
        raise HTTPException(status_code=400, detail="Uploaded photo is empty")

    # 2. Fetch the question first, so an unknown id fails before any upload
    #    or OCR work is done and no orphan file is left in storage.
    q_result = sb.table("paper_questions").select("*").eq("id", question_id).execute()
    if not q_result.data:
        raise HTTPException(status_code=404, detail="Question not found")
    question = q_result.data[0]

    # 3. Upload to storage. The filename comes from the client: keep only its
    #    last path component so it cannot add directory segments to the
    #    object key, and fall back to a fixed name when it is missing.
    safe_name = (photo.filename or "").replace("\\", "/").rsplit("/", 1)[-1]
    if safe_name in ("", ".", ".."):
        safe_name = "photo"
    storage_path = f"attempts/{user_id}/{question_id}/{safe_name}"
    bucket = sb.storage.from_("attempt-photos")
    bucket.upload(
        storage_path, photo_bytes,
        file_options={"content-type": photo.content_type or "image/jpeg", "upsert": "true"},
    )
    photo_url = bucket.get_public_url(storage_path)

    # 4. OCR (run in thread pool to avoid blocking event loop)
    ocr_text = await asyncio.to_thread(ocr_photo, photo_bytes)

    # 5. AI grading (run in thread pool)
    grade_result = await asyncio.to_thread(grade_answer, question, ocr_text)
    is_correct = grade_result.get("is_correct", False)

    # 6. Save attempt; anything not graded correct goes into the error book.
    record = {
        "user_id": user_id,
        "question_id": question_id,
        "attempt_type": "photo",
        "photo_url": photo_url,
        "photo_ocr_text": ocr_text,
        "is_correct": is_correct,
        "feedback": grade_result.get("feedback", ""),
        "error_at_step": grade_result.get("error_at_step"),
        "in_error_book": not is_correct,
    }
    result = sb.table("user_attempts").insert(record).execute()

    return {
        "attempt": result.data[0],
        "ocr_text": ocr_text,
        "grade": grade_result,
    }
|
||||
|
||||
|
||||
@router.get("/error-book")
async def get_error_book(
    course_code: str | None = None,
    user_id: str = Depends(get_current_user_id),
):
    """Return the user's error book: unmastered attempts flagged ``in_error_book``.

    Args:
        course_code: Optional filter; matched in upper case against the
            course of the paper each question belongs to.
        user_id: Authenticated user, injected by ``get_current_user_id``.

    Returns:
        Attempts, newest first, each extended with a ``paper_questions`` key
        holding the question row plus its ``paper`` row. Attempts whose
        question no longer exists are dropped.
    """
    sb = get_supabase()
    attempts = (
        sb.table("user_attempts")
        .select("*")
        .eq("user_id", user_id)
        .eq("in_error_book", True)
        .eq("mastered", False)
        .order("created_at", desc=True)
        .execute()
        .data
    )
    if not attempts:
        return []

    question_ids = list({attempt["question_id"] for attempt in attempts})
    questions = (
        sb.table("paper_questions")
        .select("*")
        .in_("id", question_ids)
        .execute()
        .data
    )
    if not questions:
        # Every attempt would be dropped below; skip the papers query rather
        # than sending an `in` filter with an empty id list.
        return []
    questions_by_id = {question["id"]: question for question in questions}

    paper_ids = list({question["paper_id"] for question in questions})
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label")
        .in_("id", paper_ids)
        .execute()
        .data
    )
    papers_by_id = {paper["id"]: paper for paper in papers}

    wanted_course = course_code.upper() if course_code else None
    enriched = []
    for attempt in attempts:
        question = questions_by_id.get(attempt["question_id"])
        if not question:
            continue
        paper = papers_by_id.get(question["paper_id"])
        # With a filter active, an attempt whose paper is missing cannot be
        # shown to belong to the course, so it is excluded. The previous
        # check skipped the comparison in that case and let it through.
        if wanted_course and (paper or {}).get("course_code") != wanted_course:
            continue

        enriched.append(
            {
                **attempt,
                "paper_questions": {
                    **question,
                    "paper": paper,
                },
            }
        )
    return enriched
|
||||
|
||||
|
||||
@router.get("/by-paper/{paper_id}")
async def get_paper_attempts(paper_id: str, user_id: str = Depends(get_current_user_id)):
    """Return the user's latest photo-graded attempt for each question of a paper.

    Args:
        paper_id: Paper whose questions are considered.
        user_id: Authenticated user, injected by ``get_current_user_id``.

    Returns:
        At most one attempt per question (the most recent ``photo`` attempt),
        ordered newest first. Empty when the paper has no questions.
    """
    sb = get_supabase()
    question_rows = (
        sb.table("paper_questions")
        .select("id")
        .eq("paper_id", paper_id)
        .execute()
        .data
    )
    question_ids = [row["id"] for row in question_rows]
    if not question_ids:
        return []

    # Filter by type and question in the query. The previous version fetched
    # every attempt the user had ever made and discarded most of them here.
    attempts = (
        sb.table("user_attempts")
        .select("question_id, is_correct, feedback, photo_ocr_text, attempt_type, created_at")
        .eq("user_id", user_id)
        .eq("attempt_type", "photo")
        .in_("question_id", question_ids)
        .order("created_at", desc=True)
        .execute()
        .data
    )

    # Rows arrive newest first, so the first row seen per question is the latest.
    latest_by_question: dict[str, dict] = {}
    for attempt in attempts:
        latest_by_question.setdefault(attempt["question_id"], attempt)
    return list(latest_by_question.values())
|
||||
|
||||
|
||||
@router.patch("/{attempt_id}")
async def update_attempt(
    attempt_id: str,
    data: AttemptUpdate,
    user_id: str = Depends(get_current_user_id),
):
    """Update an attempt's error-book flags (for example, mark it mastered).

    Args:
        attempt_id: Id of the attempt to update.
        data: Flags to change; fields left as ``None`` are not touched.
        user_id: Authenticated user, injected by ``get_current_user_id``.

    Returns:
        The updated attempt row.

    Raises:
        HTTPException: 400 when no flag is supplied; 404 when no attempt with
            this id belongs to the caller.
    """
    changes = {
        field: value
        for field, value in (
            ("in_error_book", data.in_error_book),
            ("mastered", data.mastered),
        )
        if value is not None
    }
    if not changes:
        raise HTTPException(status_code=400, detail="Nothing to update")

    # Scope the update to the caller's own rows. This endpoint previously had
    # no authentication, so anyone could modify any attempt by id.
    result = (
        get_supabase()
        .table("user_attempts")
        .update(changes)
        .eq("id", attempt_id)
        .eq("user_id", user_id)
        .execute()
    )
    if not result.data:
        raise HTTPException(status_code=404, detail="Attempt not found")
    return result.data[0]
|
||||
142
backend/app/routers/papers.py
Normal file
142
backend/app/routers/papers.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""试卷上传 + 处理管线"""
|
||||
|
||||
import asyncio
|
||||
import threading
|
||||
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.text_extractor import extract_pdf, get_full_text
|
||||
from app.services.paper_processor import process_paper
|
||||
from app.dependencies.auth import get_current_user_id
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _upload_and_process_sync(
    paper_id: str,
    storage_path: str,
    paper_bytes: bytes,
    answer_bytes: bytes | None,
):
    """Run on a dedicated thread: upload the PDFs to storage, then start AI processing.

    The storage upload is best-effort: a failure is logged and processing
    still runs, because ``process_paper`` is given the bytes directly.

    Args:
        paper_id: Id of the already-inserted ``papers`` row.
        storage_path: Folder inside the ``papers`` bucket for this upload.
        paper_bytes: Content of the paper PDF.
        answer_bytes: Content of the answer PDF, or ``None`` when not provided.
    """
    import logging  # local import: this block is the only user in the module

    sb = get_supabase()
    try:
        bucket = sb.storage.from_("papers")
        pdf_options = {"content-type": "application/pdf", "upsert": "true"}

        paper_storage_path = f"{storage_path}/paper.pdf"
        bucket.upload(paper_storage_path, paper_bytes, file_options=pdf_options)
        update_data: dict = {"paper_file_url": bucket.get_public_url(paper_storage_path)}

        if answer_bytes:
            answer_storage_path = f"{storage_path}/answer.pdf"
            bucket.upload(answer_storage_path, answer_bytes, file_options=pdf_options)
            update_data["answer_file_url"] = bucket.get_public_url(answer_storage_path)

        sb.table("papers").update(update_data).eq("id", paper_id).execute()
    except Exception:
        # Previously swallowed with `pass`, which left a missing file URL
        # impossible to diagnose. Still non-fatal.
        logging.getLogger(__name__).exception(
            "Storage upload failed for paper %s; continuing with processing", paper_id
        )

    # process_paper is async; run it in a fresh event loop on this thread.
    asyncio.run(process_paper(paper_id, paper_bytes, answer_bytes))
|
||||
|
||||
|
||||
@router.get("/")
async def list_papers():
    """List all papers, newest first (papers are a public asset shared by every user)."""
    columns = (
        "id, course_code, year, term, exam_type, status, question_count, "
        "total_score, difficulty_level, processing_step, processing_progress, "
        "processing_total, created_at"
    )
    query = get_supabase().table("papers").select(columns).order("created_at", desc=True)
    return query.execute().data
|
||||
|
||||
|
||||
@router.get("/mine")
async def my_papers(user_id: str = Depends(get_current_user_id)):
    """List the papers uploaded by the current user, newest first (including ones still processing)."""
    columns = (
        "id, course_code, year, term, exam_type, part_label, status, "
        "question_count, processing_step, processing_progress, "
        "processing_total, created_at"
    )
    query = (
        get_supabase()
        .table("papers")
        .select(columns)
        .eq("user_id", user_id)
        .order("created_at", desc=True)
    )
    return query.execute().data
|
||||
|
||||
|
||||
@router.post("/upload")
async def upload_paper(
    paper_file: UploadFile = File(...),
    answer_file: UploadFile | None = File(None),
    course_code: str = Form(...),
    year: int = Form(...),
    term: str = Form(...),
    exam_type: str = Form(...),
    user_id: str = Depends(get_current_user_id),
):
    """Upload a paper PDF (answer PDF optional) and start background processing.

    The ``papers`` row is inserted with status ``processing`` and the request
    returns at once; the storage upload and AI pipeline run on a daemon thread.

    Returns:
        ``{"paper_id", "status", "message"}`` for the newly created paper.

    Raises:
        HTTPException: 400 when the uploaded paper file is empty.
    """
    # 1. Read the file contents (already in memory, so this is fast).
    paper_bytes = await paper_file.read()
    if not paper_bytes:
        # Reject before inserting: an empty upload would otherwise create a
        # row stuck in "processing" and hand b"" to the processor, which is
        # the same value the startup resume path passes.
        raise HTTPException(status_code=400, detail="Paper file is empty")
    answer_bytes = await answer_file.read() if answer_file else None

    # 2. Create the record right away (status=processing) so we can return.
    normalized_code = course_code.upper()
    storage_path = f"{normalized_code}/{year}_{term}_{exam_type}"
    sb = get_supabase()
    paper_record = sb.table("papers").insert({
        "user_id": user_id,
        "course_code": normalized_code,
        "year": year,
        "term": term,
        "exam_type": exam_type,
        "paper_file_url": "",  # filled in by the background upload
        "answer_file_url": None,
        "status": "processing",
    }).execute()
    paper_id = paper_record.data[0]["id"]

    # 3. Run on a separate thread so the event loop is never blocked.
    threading.Thread(
        target=_upload_and_process_sync,
        args=(paper_id, storage_path, paper_bytes, answer_bytes),
        daemon=True,
    ).start()

    return {
        "paper_id": paper_id,
        "status": "processing",
        "message": "试卷已上传,正在处理中...",
    }
|
||||
|
||||
|
||||
@router.get("/{paper_id}")
async def get_paper(paper_id: str):
    """Return one paper row, including its processing status; 404 when the id is unknown."""
    rows = get_supabase().table("papers").select("*").eq("id", paper_id).execute().data
    if rows:
        return rows[0]
    raise HTTPException(status_code=404, detail="Paper not found")
|
||||
|
||||
|
||||
@router.get("/{paper_id}/questions")
async def get_questions(paper_id: str):
    """Return every question of a paper (with the AI trio fields), ordered by ``display_order``."""
    query = (
        get_supabase()
        .table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
    )
    response = query.execute()
    return response.data
|
||||
325
backend/app/routers/questions.py
Normal file
325
backend/app/routers/questions.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""题目相关:变式题生成 + 相似题召回"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.grader import generate_variant
|
||||
from app.dependencies.auth import get_current_user_id
|
||||
|
||||
# Simple in-memory cache: question_id → (timestamp, result)
|
||||
# Module-level dict, so entries live only in this process.
_similar_cache: dict[str, tuple[float, list]] = {}
# Entry lifetime in seconds.
_CACHE_TTL = 300  # 5 minutes
|
||||
|
||||
|
||||
class VariantUpdate(BaseModel):
    """Partial update for a question variant; ``None`` leaves the flag unchanged."""

    # Whether the user has favorited the variant.
    favorited: bool | None = None
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def normalized_labels(values: list[str] | None) -> dict[str, str]:
    """Map each non-empty label's lower-cased form to the label itself.

    When several labels differ only by case, the last spelling wins.
    ``None`` is treated as an empty list.
    """
    return {label.lower(): label for label in (values or []) if label}
|
||||
|
||||
|
||||
def question_family(question: dict) -> str:
    """Return ``question_format`` if set, else ``question_type``, else ``"unknown"``."""
    for field in ("question_format", "question_type"):
        value = question.get(field)
        if value:
            return value
    return "unknown"
|
||||
|
||||
|
||||
def display_topics(question: dict) -> list[str]:
    """Return the topic labels to display for a question, without duplicates.

    ``analytics_topic`` followed by ``topic_tags`` is preferred; the legacy
    ``topics`` list is used only when those yield nothing.
    """
    def unique_truthy(items) -> list[str]:
        """Keep the first occurrence of each non-empty item, preserving order."""
        kept: list[str] = []
        for item in items:
            if item and item not in kept:
                kept.append(item)
        return kept

    preferred = unique_truthy(
        [question.get("analytics_topic"), *(question.get("topic_tags") or [])]
    )
    return preferred or unique_truthy(question.get("topics") or [])
|
||||
|
||||
|
||||
def similarity_score(
    target: dict,
    candidate: dict,
    text_score: float = 0.0,
) -> tuple[int, list[str]]:
    """Score how closely ``candidate`` resembles ``target`` and list the reasons.

    Points awarded:
        * same ``analytics_topic``: 40
        * shared ``topic_tags`` (case-insensitive): 10 each, at most 20
        * shared ``skill_tags`` (case-insensitive): 10 each, at most 20
        * same question family: 10
        * same non-empty difficulty: 5
        * full-text rank ``text_score``: ``round(text_score * 60)``, at most 20

    Returns:
        ``(score, reasons)`` with the score capped at 99 and ``reasons`` a
        list of short display strings.
    """
    def overlap(field: str) -> tuple[dict[str, str], list[str]]:
        """Return target's label map for ``field`` and the sorted keys both sides share."""
        mine = normalized_labels(target.get(field))
        theirs = normalized_labels(candidate.get(field))
        return mine, sorted(mine.keys() & theirs.keys())

    total = 0
    why: list[str] = []

    # Primary topic bucket: 40 pts
    topic = target.get("analytics_topic")
    same_topic = bool(topic) and topic == candidate.get("analytics_topic")
    if same_topic:
        total += 40
        why.append(f"Same topic: {topic}")

    # Concept overlap: up to 20 pts
    concept_labels, concept_keys = overlap("topic_tags")
    if concept_keys:
        total += min(10 * len(concept_keys), 20)
        # The concept reason is redundant when the primary topic already matched.
        if not same_topic:
            shown = ", ".join(concept_labels[key] for key in concept_keys[:2])
            why.append("Shared concept: " + shown)

    # Skill overlap: up to 20 pts
    skill_labels, skill_keys = overlap("skill_tags")
    if skill_keys:
        total += min(10 * len(skill_keys), 20)
        shown = ", ".join(skill_labels[key] for key in skill_keys[:2])
        why.append("Shared skill: " + shown)

    # Same question format: 10 pts
    if question_family(candidate) == question_family(target):
        total += 10
        why.append("Same format")

    # Same difficulty: 5 pts
    candidate_difficulty = candidate.get("difficulty")
    if candidate_difficulty and candidate_difficulty == target.get("difficulty"):
        total += 5
        why.append("Same difficulty")

    # Full-text similarity from PostgreSQL ts_rank_cd: up to 20 pts
    if text_score > 0:
        wording_points = min(round(text_score * 60), 20)
        total += wording_points
        if wording_points >= 4:
            why.append("Similar wording")

    return min(total, 99), why
|
||||
|
||||
|
||||
@router.get("/variants/favorited")
async def get_favorited_variants(user_id: str = Depends(get_current_user_id)):
    """Return all variants the user has favorited, newest first (used by the Error Book).

    Each row embeds the source question's number and paper id, plus that
    paper's identifying columns.
    """
    columns = (
        "*, paper_questions(question_number, paper_id, "
        "papers(id, course_code, year, term, exam_type, part_label))"
    )
    query = (
        get_supabase()
        .table("question_variants")
        .select(columns)
        .eq("user_id", user_id)
        .eq("favorited", True)
        .order("created_at", desc=True)
    )
    return query.execute().data
|
||||
|
||||
|
||||
@router.post("/{question_id}/variant")
async def create_variant(question_id: str, user_id: str = Depends(get_current_user_id)):
    """Generate a variant of a question and store it for the current user.

    Args:
        question_id: ``paper_questions.id`` of the source question.
        user_id: Owner of the new variant, resolved by ``get_current_user_id``.

    Returns:
        The inserted ``question_variants`` row, extended with
        ``source_question_number``.

    Raises:
        HTTPException: 404 when the question does not exist, 500 when the
            insert returns no row.
    """
    sb = get_supabase()
    result = sb.table("paper_questions").select("*").eq("id", question_id).execute()
    if not result.data:
        raise HTTPException(status_code=404, detail="Question not found")

    question = result.data[0]
    # generate_variant is synchronous, so it runs in a worker thread.
    variant_data = await asyncio.to_thread(generate_variant, question)
    # select("*") returns the key even when the column is NULL, so a .get()
    # default would never apply; normalise None to "" explicitly.
    variant_data["knowledge_reminder"] = question.get("knowledge_reminder") or ""

    saved = sb.table("question_variants").insert({
        "user_id": user_id,
        "source_question_id": question_id,
        "variant_data": variant_data,
        "favorited": False,
    }).execute()
    if not saved.data:
        # Without this guard an empty insert result surfaced as an IndexError.
        raise HTTPException(status_code=500, detail="Failed to save variant")

    row = saved.data[0]
    row["source_question_number"] = question["question_number"]
    return row
|
||||
|
||||
|
||||
@router.get("/{question_id}/variants")
async def list_variants(question_id: str, user_id: str = Depends(get_current_user_id)):
    """List the current user's variants of one question, newest first.

    Every returned row is tagged with ``source_question_number``; it is an empty
    string when the source question cannot be found.
    """
    sb = get_supabase()
    lookup = sb.table("paper_questions").select("question_number").eq("id", question_id).execute()
    if lookup.data:
        number = lookup.data[0]["question_number"]
    else:
        number = ""

    query = sb.table("question_variants").select("*")
    query = query.eq("user_id", user_id).eq("source_question_id", question_id)
    variants = query.order("created_at", desc=True).execute().data
    for variant in variants:
        variant["source_question_number"] = number
    return variants
|
||||
|
||||
|
||||
@router.patch("/variant/{variant_id}")
async def update_variant(variant_id: str, data: VariantUpdate, user_id: str = Depends(get_current_user_id)):
    """Update a variant owned by the current user (favorite / unfavorite).

    Responds 400 when the payload carries no ``favorited`` value and 404 when
    no row matches both the variant id and the user.
    """
    if data.favorited is None:
        raise HTTPException(status_code=400, detail="Nothing to update")
    changes: dict = {"favorited": data.favorited}

    statement = get_supabase().table("question_variants").update(changes)
    outcome = statement.eq("id", variant_id).eq("user_id", user_id).execute()
    if outcome.data:
        return outcome.data[0]
    raise HTTPException(status_code=404, detail="Variant not found")
|
||||
|
||||
|
||||
@router.delete("/variant/{variant_id}", status_code=204)
async def delete_variant(variant_id: str, user_id: str = Depends(get_current_user_id)):
    """Delete a variant, restricted to rows owned by the current user.

    The delete result is not inspected, so the response is 204 whether or not
    a row matched.
    """
    owned_variant = (
        get_supabase()
        .table("question_variants")
        .delete()
        .eq("id", variant_id)
        .eq("user_id", user_id)
    )
    owned_variant.execute()
|
||||
|
||||
|
||||
@router.get("/{question_id}/similar")
async def get_similar_questions(question_id: str, limit: int = 6):
    """Return questions from other papers of the same course that resemble this one.

    Lookup order:
      1. the in-process ``_similar_cache`` (entries younger than ``_CACHE_TTL``);
      2. the ``similar_questions`` value stored on the question row;
      3. an on-the-fly ranking with ``similarity_score``, which is then cached
         and written back to the row on a best-effort basis.

    Args:
        question_id: ``paper_questions.id`` of the target question.
        limit: Maximum number of results; clamped to 1..12.

    Returns:
        A list of result dicts. Freshly computed results are sorted by
        descending ``match_percent`` and contain at most one question per paper.

    Raises:
        HTTPException: 404 when the question or its paper does not exist.
    """
    import logging  # function-scope import; the file-level import block is left untouched

    log = logging.getLogger(__name__)
    page_size = max(1, min(limit, 12))

    # 1. In-memory cache hit.
    cached = _similar_cache.get(question_id)
    if cached and (time.time() - cached[0]) < _CACHE_TTL:
        return cached[1][:page_size]

    sb = get_supabase()
    result = sb.table("paper_questions").select("*, similar_questions").eq("id", question_id).execute()
    if not result.data:
        raise HTTPException(status_code=404, detail="Question not found")

    target = result.data[0]

    # 2. Stored result: serve it as-is and remember it in the cache.
    #    (No refresh is scheduled here; the stored value is returned unchanged.)
    if target.get("similar_questions"):
        precomputed = target["similar_questions"]
        _similar_cache[question_id] = (time.time(), precomputed)
        return precomputed[:page_size]

    # 3. Fallback for questions that have not been backfilled: compute now.
    paper_result = sb.table("papers").select("id, course_code").eq("id", target["paper_id"]).execute()
    if not paper_result.data:
        raise HTTPException(status_code=404, detail="Paper not found")

    course_code = paper_result.data[0]["course_code"]
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label")
        .eq("course_code", course_code)
        .eq("status", "ready")
        .execute()
        .data
    )
    paper_ids = [paper["id"] for paper in papers if paper["id"] != target["paper_id"]]
    if not paper_ids:
        return []

    papers_by_id = {paper["id"]: paper for paper in papers}

    # Pre-filter by analytics_topic in the database when the target has one
    # (original author's estimate: roughly 250 -> 30 candidates).
    candidates_query = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
            "difficulty, knowledge_reminder, ai_hint, solution"
        )
        .in_("paper_id", paper_ids)
    )
    target_topic = target.get("analytics_topic")
    if target_topic:
        candidates_query = candidates_query.eq("analytics_topic", target_topic)

    candidates = candidates_query.execute().data
    if not candidates:
        return []

    # Batch full-text scores from PostgreSQL; skipped for large candidate sets
    # because the original author found it slow.
    text_scores: dict[str, float] = {}
    if len(candidates) <= 50:
        try:
            rpc_result = sb.rpc(
                "text_similarity_scores",
                {
                    "query_text": target.get("question_text") or "",
                    "candidate_ids": [c["id"] for c in candidates],
                },
            ).execute()
            for row in rpc_result.data or []:
                text_scores[row["question_id"]] = float(row["text_score"] or 0)
        except Exception:
            # Text scoring is optional: rank without it, but leave a trace
            # instead of silently discarding the failure.
            log.warning("text_similarity_scores failed for question %s", question_id, exc_info=True)

    ranked = []
    for candidate in candidates:
        text_score = text_scores.get(candidate["id"], 0.0)
        match_percent, reasons = similarity_score(target, candidate, text_score)
        if match_percent < 20:
            continue
        paper = papers_by_id.get(candidate["paper_id"], {})
        # Columns may be NULL: dict.get(key, "") still returns None for a
        # present key, so normalise before calling str methods.
        year = paper.get("year") or ""
        term = (paper.get("term") or "").title()
        exam_type = (paper.get("exam_type") or "").title()
        source = f"{year} {term} {exam_type}".strip()
        if paper.get("part_label"):
            source = f"{source} Part {paper['part_label']}"
        ranked.append(
            {
                "id": candidate["id"],
                "paper_id": candidate["paper_id"],
                "source": source,
                "question_number": candidate["question_number"],
                "match_percent": match_percent,
                "match_reasons": reasons,
                "question_type": question_family(candidate),
                "question_text": candidate["question_text"],
                "topics": display_topics(candidate),
                "difficulty": candidate.get("difficulty"),
                "knowledge_reminder": candidate.get("knowledge_reminder", ""),
                "ai_hint": candidate.get("ai_hint", ""),
                "solution": candidate.get("solution", ""),
            }
        )

    # "or ''" keeps the tie-break comparable when a question number is NULL.
    ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"] or ""))

    # Keep only the best-scoring question per paper.
    seen_papers: set[str] = set()
    deduped = []
    for item in ranked:
        if item["paper_id"] not in seen_papers:
            seen_papers.add(item["paper_id"])
            deduped.append(item)

    _similar_cache[question_id] = (time.time(), deduped)

    # Persist so later requests can use the stored value; failure is non-fatal.
    try:
        sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", question_id).execute()
    except Exception:
        log.warning("Could not persist similar_questions for question %s", question_id, exc_info=True)

    return deduped[:page_size]
|
||||
0
backend/app/services/__init__.py
Normal file
0
backend/app/services/__init__.py
Normal file
146
backend/app/services/grader.py
Normal file
146
backend/app/services/grader.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""OCR, grading, and variant generation prompts"""
|
||||
|
||||
import json
|
||||
import base64
|
||||
from app.services.llm_clients import get_vision_client, get_deepseek_client
|
||||
|
||||
OCR_PROMPT = """You are an expert at recognizing handwritten answers. Analyze this photo of a student's handwritten answer and extract the text and mathematical formulas.
|
||||
|
||||
Requirements:
|
||||
- Faithfully extract what the student wrote, do not modify or correct
|
||||
- Use LaTeX format for math formulas (e.g. $x^2 + 1$)
|
||||
- If there are multiple steps, list them in original order
|
||||
- If some handwriting is unclear, mark with [unclear]
|
||||
|
||||
Return only the extracted text, no additional explanation."""
|
||||
|
||||
GRADING_PROMPT = """You are an expert academic grader. Grade the following student answer. ALL output must be in English.
|
||||
|
||||
Question info:
|
||||
- Number: {question_number}
|
||||
- Type: {question_type}
|
||||
- Question: {question_text}
|
||||
- Score: {score}
|
||||
|
||||
Reference answer / solution:
|
||||
{reference_answer}
|
||||
|
||||
Student answer:
|
||||
{student_answer}
|
||||
|
||||
Grade and return JSON:
|
||||
{{
|
||||
"is_correct": true/false,
|
||||
"score_given": 0-{score},
|
||||
"feedback": "<HTML> Step-by-step analysis of the student's answer, pointing out correct parts and errors, using KaTeX formulas </HTML>",
|
||||
"error_at_step": null or the step number where errors begin (integer)
|
||||
}}
|
||||
|
||||
Grading rules:
|
||||
- MC / fill-blank: only correct if answer matches exactly
|
||||
- Long questions: give partial credit for correct steps even if the final answer is wrong
|
||||
- feedback in HTML format, supports KaTeX ($..$ inline, $$...$$ block)
|
||||
- Mark errors with <div class="common-error">...</div>
|
||||
- Identify exactly which step the error starts"""
|
||||
|
||||
VARIANT_PROMPT = """You are an expert exam question creator. Generate a similar but different variant question based on the original below. ALL output must be in English.
|
||||
|
||||
Original question info:
|
||||
- Type: {question_type}
|
||||
- Question: {question_text}
|
||||
- Topics: {topics}
|
||||
- Difficulty: {difficulty}
|
||||
- Reference answer: {answer}
|
||||
|
||||
Requirements:
|
||||
- Variant must test the same knowledge points at similar difficulty
|
||||
- Data/scenario/wording must differ — don't just change numbers
|
||||
- Must provide a complete correct answer
|
||||
|
||||
Format requirements (CRITICAL):
|
||||
- All text in HTML format, absolutely NO markdown syntax
|
||||
- Code: <pre><code class="language-xxx">...</code></pre>, NOT ```
|
||||
- Math: $...$ (inline) or $$...$$ (block), KaTeX compatible
|
||||
- Line breaks: <br>, paragraphs: <p>
|
||||
|
||||
Return JSON:
|
||||
{{
|
||||
"question_text": "HTML formatted variant question",
|
||||
"question_type": "{question_type}",
|
||||
"options": [MC only, format {{"label":"A","text":"..."}}, ...] or null,
|
||||
"correct_answer": "Correct answer (plain text)",
|
||||
"ai_hint": "HTML formatted hint that guides thinking WITHOUT giving the answer",
|
||||
"solution": "HTML formatted complete step-by-step solution"
|
||||
}}"""
|
||||
|
||||
|
||||
def ocr_photo(photo_bytes: bytes) -> str:
    """Transcribe a photo of a handwritten answer with the vision client.

    The image is sent inline as a base64 ``data:image/jpeg`` URL together with
    ``OCR_PROMPT`` as the system message. Returns the model's text, or an empty
    string when the reply has no content.
    """
    vision = get_vision_client()
    encoded = base64.b64encode(photo_bytes).decode("utf-8")
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }
    conversation = [
        {"role": "system", "content": OCR_PROMPT},
        {"role": "user", "content": [image_part]},
    ]
    completion = vision.chat.completions.create(
        model="gemini-2.5-flash",
        messages=conversation,
        temperature=0,
        max_tokens=2000,
    )
    transcript = completion.choices[0].message.content
    return transcript if transcript else ""
|
||||
|
||||
|
||||
def grade_answer(question: dict, student_answer: str) -> dict:
    """Grade a student's answer with the DeepSeek chat model.

    Args:
        question: Question row; must contain ``question_number``,
            ``question_type`` and ``question_text``. ``raw_answer_text`` or
            ``solution`` is used as the reference answer when present.
        student_answer: The student's answer text.

    Returns:
        The model's JSON reply parsed into a dict (``GRADING_PROMPT`` asks for
        ``is_correct``, ``score_given``, ``feedback`` and ``error_at_step``).

    Raises:
        KeyError: When a required question field is missing.
        ValueError: When the model returns no content or invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    reference = question.get("raw_answer_text") or question.get("solution") or "No reference answer"
    score = question.get("score") or "unknown"

    prompt = GRADING_PROMPT.format(
        question_number=question["question_number"],
        question_type=question["question_type"],
        question_text=question["question_text"],
        score=score,
        reference_answer=reference,
        student_answer=student_answer,
    )

    ds = get_deepseek_client()
    resp = ds.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "system", "content": prompt}],
        temperature=0.2,
        response_format={"type": "json_object"},
    )
    content = resp.choices[0].message.content
    if not content:
        # json.loads(None) would raise a confusing TypeError.
        raise ValueError("Grading model returned an empty response")
    return json.loads(content)
|
||||
|
||||
|
||||
def generate_variant(question: dict) -> dict:
    """Generate a variant of ``question`` with the DeepSeek chat model.

    Args:
        question: Question row; must contain ``question_type`` and
            ``question_text``. The reference answer is the first non-empty of
            ``correct_option``, ``correct_answer``, ``raw_answer_text``.

    Returns:
        The model's JSON reply parsed into a dict (``VARIANT_PROMPT`` asks for
        ``question_text``, ``question_type``, ``options``, ``correct_answer``,
        ``ai_hint`` and ``solution``).

    Raises:
        KeyError: When a required question field is missing.
        ValueError: When the model returns no content or invalid JSON.
    """
    answer = (
        question.get("correct_option")
        or question.get("correct_answer")
        or question.get("raw_answer_text")
        or "N/A"
    )
    # Rows fetched with select("*") contain the key even when the column is
    # NULL, so .get(key, default) yields None; fall back explicitly.
    topics = question.get("topics") or []
    difficulty = question.get("difficulty") or "medium"

    prompt = VARIANT_PROMPT.format(
        question_type=question["question_type"],
        question_text=question["question_text"],
        topics=", ".join(str(topic) for topic in topics),
        difficulty=difficulty,
        answer=answer,
    )

    ds = get_deepseek_client()
    resp = ds.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "system", "content": prompt}],
        temperature=0.5,
        response_format={"type": "json_object"},
    )
    content = resp.choices[0].message.content
    if not content:
        raise ValueError("Variant model returned an empty response")
    return json.loads(content)
|
||||
74
backend/app/services/llm_clients.py
Normal file
74
backend/app/services/llm_clients.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import httpx
|
||||
from openai import OpenAI
|
||||
from app.config import get_settings
|
||||
|
||||
# Explicit HTTP timeout (seconds per phase) shared by the clients below that pass it.
_TIMEOUT = httpx.Timeout(connect=10, read=300, write=60, pool=10)

# Lazily created client singletons; each is filled in by its get_*_client() accessor.
_gpt_client: OpenAI | None = None
_qwen_client: OpenAI | None = None
_gemini_flash_client: OpenAI | None = None
_gemini_lite_client: OpenAI | None = None
_deepseek_client: OpenAI | None = None
|
||||
|
||||
|
||||
def get_gpt_client() -> OpenAI:
    """Return the shared client for the laozhang endpoint (gpt-4o / gpt-4o-mini).

    The client is created on first use from ``laozhang_base_url`` and
    ``laozhang_api_key`` and reused afterwards.
    """
    global _gpt_client
    if _gpt_client is None:
        s = get_settings()
        _gpt_client = OpenAI(
            base_url=s.laozhang_base_url,
            api_key=s.laozhang_api_key,
            # Use the module's explicit timeout, like the other clients here.
            timeout=_TIMEOUT,
        )
    return _gpt_client
|
||||
|
||||
|
||||
def get_qwen_client() -> OpenAI:
    """Return the shared client for the DashScope endpoint (qwen-plus).

    The client is created on first use from ``dashscope_base_url`` and
    ``dashscope_api_key`` and reused afterwards.
    """
    global _qwen_client
    if _qwen_client is None:
        s = get_settings()
        _qwen_client = OpenAI(
            base_url=s.dashscope_base_url,
            api_key=s.dashscope_api_key,
            # Use the module's explicit timeout, like the other clients here.
            timeout=_TIMEOUT,
        )
    return _qwen_client
|
||||
|
||||
|
||||
def get_vision_client() -> OpenAI:
    """Return the shared client for Google's official Gemini OpenAI-compatible API.

    Used for vision work (question splitting and OCR). The original author
    notes it is reachable when deployed in Singapore.
    """
    global _gemini_flash_client
    if _gemini_flash_client is not None:
        return _gemini_flash_client

    settings = get_settings()
    _gemini_flash_client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=settings.google_gemini_api_key,
        timeout=_TIMEOUT,
    )
    return _gemini_flash_client
|
||||
|
||||
|
||||
def get_gemini_lite_client() -> OpenAI:
    """Return the shared laozhang client intended for gemini-3.1-flash-lite-preview.

    Described by the original author as the lightweight option for the AI trio.
    """
    global _gemini_lite_client
    if _gemini_lite_client is not None:
        return _gemini_lite_client

    settings = get_settings()
    _gemini_lite_client = OpenAI(
        base_url=settings.laozhang_base_url,
        api_key=settings.laozhang_api_key,
        timeout=_TIMEOUT,
    )
    return _gemini_lite_client
|
||||
|
||||
|
||||
def get_deepseek_client() -> OpenAI:
    """Return the shared DeepSeek client (deepseek-chat), used for the AI trio."""
    global _deepseek_client
    if _deepseek_client is not None:
        return _deepseek_client

    settings = get_settings()
    _deepseek_client = OpenAI(
        base_url=settings.deepseek_base_url,
        api_key=settings.deepseek_api_key,
        timeout=_TIMEOUT,
    )
    return _deepseek_client
|
||||
576
backend/app/services/paper_processor.py
Normal file
576
backend/app/services/paper_processor.py
Normal file
@@ -0,0 +1,576 @@
|
||||
"""试卷处理管线:PDF → 结构化题目 → AI 三件套(Vision 模式)"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
import traceback
|
||||
from contextlib import redirect_stdout
|
||||
import fitz # pymupdf
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.llm_clients import get_vision_client, get_deepseek_client
|
||||
|
||||
|
||||
def strip_nulls(obj):
    """Recursively remove NUL (U+0000) characters from strings; PostgreSQL rejects them.

    Strings are cleaned, dicts and lists are rebuilt with cleaned contents, and
    any other value is returned unchanged. Dict keys are cleaned as well as
    values, since a NUL inside a key is rejected just like one inside a value.

    Args:
        obj: Any JSON-like value.

    Returns:
        A cleaned copy for str/dict/list inputs, otherwise ``obj`` itself.
    """
    if isinstance(obj, str):
        return obj.replace("\u0000", "")
    if isinstance(obj, dict):
        return {strip_nulls(k): strip_nulls(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [strip_nulls(i) for i in obj]
    return obj
|
||||
|
||||
|
||||
# ============================================
|
||||
# Prompts
|
||||
# ============================================
|
||||
|
||||
STRUCTURE_PROMPT = """You are an expert exam paper structure analyst. You are given images of a past exam paper. Analyze every page carefully and extract all questions into structured JSON.
|
||||
All generated values must be in English. Do not output Chinese.
|
||||
|
||||
CRITICAL RULES for question_text:
|
||||
- Each question's question_text must be FULLY SELF-CONTAINED. Include ALL context needed to solve it.
|
||||
- For sub-questions (e.g. (a)(i)), copy the ENTIRE parent question setup (variable definitions, code blocks, problem description) into the question_text, then append the specific sub-question.
|
||||
- For Python/code questions: include ALL variable definitions and import statements verbatim, exactly as they appear in the exam, preserving multi-line arrays and data structures completely.
|
||||
- Never truncate code. If a variable is defined across multiple lines (e.g. a numpy array), include every line.
|
||||
|
||||
Output JSON format (strictly follow):
|
||||
{
|
||||
"total_score": 100,
|
||||
"difficulty_level": "medium",
|
||||
"topics_summary": {"Topic A": 40, "Topic B": 30, "Topic C": 30},
|
||||
"questions": [
|
||||
{
|
||||
"question_number": "1a",
|
||||
"parent_question": "1",
|
||||
"question_type": "mc",
|
||||
"question_text": "Original question text...",
|
||||
"score": 5,
|
||||
"page_number": 1,
|
||||
"options": [{"label": "A", "text": "Option content"}, {"label": "B", "text": "..."}],
|
||||
"topics": ["Linked List", "Pointer"],
|
||||
"difficulty": "easy"
|
||||
},
|
||||
{
|
||||
"question_number": "2",
|
||||
"parent_question": null,
|
||||
"question_type": "long_question",
|
||||
"question_text": "Original question text...",
|
||||
"score": 15,
|
||||
"page_number": 2,
|
||||
"options": null,
|
||||
"topics": ["Recursion"],
|
||||
"difficulty": "hard"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- question_type must be one of: "mc" (multiple choice), "true_false" (true/false), "fill_blank" (fill in blank), "long_question" (long question)
|
||||
- True/False questions MUST use "true_false" type, with options set to [{"label":"True","text":"True"},{"label":"False","text":"False"}], correct_option as "True" or "False"
|
||||
- Multiple choice must extract the options array
|
||||
- Sub-questions use parent_question to link to parent: "1a" parent is "1"
|
||||
- Independent questions without sub-questions set parent_question to null
|
||||
- page_number inferred from where the question appears
|
||||
- topics inferred from the question content
|
||||
- difficulty: "easy" | "medium" | "hard"
|
||||
- Extract ALL questions, do not miss any
|
||||
- Keep topic labels in English only
|
||||
"""
|
||||
|
||||
ANSWER_MATCH_PROMPT = """You are an expert exam answer matching specialist. Below is the answer text for an exam paper. Extract and match answers to their corresponding question numbers.
|
||||
All generated values must be in English. Do not output Chinese.
|
||||
|
||||
Question structure:
|
||||
{questions_json}
|
||||
|
||||
Answer text:
|
||||
{answer_text}
|
||||
|
||||
Output JSON format:
|
||||
{{
|
||||
"answers": [
|
||||
{{
|
||||
"question_number": "1a",
|
||||
"correct_option": "B",
|
||||
"correct_answer": null,
|
||||
"raw_answer_text": "Original answer text..."
|
||||
}},
|
||||
{{
|
||||
"question_number": "2",
|
||||
"correct_option": null,
|
||||
"correct_answer": null,
|
||||
"raw_answer_text": "Complete solution process and answer..."
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Rules:
|
||||
- For MC questions, fill correct_option (e.g. "B")
|
||||
- For fill-blank questions, fill correct_answer (e.g. "O(n log n)")
|
||||
- For long questions, only fill raw_answer_text (complete solution process)
|
||||
- Match all questions where answers can be found
|
||||
- Keep raw_answer_text faithful to the source answer, but do not add Chinese commentary
|
||||
"""
|
||||
|
||||
ANALYSIS_PROMPT = """You are an expert academic answer analyst. Generate three sections for the following exam question. ALL output must be in English.
|
||||
|
||||
Question info:
|
||||
- Number: {question_number}
|
||||
- Type: {question_type}
|
||||
- Score: {score}
|
||||
- Question: {question_text}
|
||||
- Topics: {topics}
|
||||
{answer_section}
|
||||
|
||||
Generate THREE sections in HTML format (supports KaTeX: block $$ ... $$ inline $ ... $):
|
||||
|
||||
Output JSON:
|
||||
{{
|
||||
"knowledge_reminder": "<HTML> Prerequisite knowledge points needed for this question, as a concise bullet list </HTML>",
|
||||
"ai_hint": "<HTML> A hint that guides thinking direction WITHOUT giving away the answer </HTML>",
|
||||
"solution": "<HTML> Complete step-by-step solution (Step 1, Step 2, ...) with derivations, formulas, and common mistake warnings </HTML>"
|
||||
}}
|
||||
|
||||
Solution requirements:
|
||||
- Must include complete working process, not just the answer
|
||||
- Each step must have an explanation
|
||||
- If a reference answer is provided, derive the solution based on it
|
||||
- If no reference answer, work out the complete solution independently
|
||||
- For MC questions, explain why the correct option is right AND why others are wrong
|
||||
- Use <ol> or numbered steps
|
||||
- Mark common mistakes with <div class="common-error">...</div>
|
||||
|
||||
KaTeX formula rules:
|
||||
- Block formula: $$ on its own line, with blank lines before and after
|
||||
- Inline formula: $x^2$ no line break
|
||||
- Matrix: \\begin{{bmatrix}} ... \\end{{bmatrix}}
|
||||
- Fraction: \\frac{{a}}{{b}}
|
||||
"""
|
||||
|
||||
BATCH_ANALYSIS_PROMPT = """You are an expert academic answer analyst. Generate three study sections for each question below. ALL output must be in English.
|
||||
|
||||
For every question, return:
|
||||
- knowledge_reminder: concise prerequisite bullets in HTML
|
||||
- ai_hint: a helpful hint in HTML without revealing the final answer
|
||||
- solution: a complete step-by-step solution in HTML
|
||||
|
||||
Return JSON in this exact format:
|
||||
{{
|
||||
"analyses": [
|
||||
{{
|
||||
"question_number": "1a",
|
||||
"knowledge_reminder": "<HTML>...</HTML>",
|
||||
"ai_hint": "<HTML>...</HTML>",
|
||||
"solution": "<HTML>...</HTML>"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Rules:
|
||||
- Return one item for every provided question_number
|
||||
- Keep each item matched to the same question_number
|
||||
- All text must be in English
|
||||
- HTML only, KaTeX compatible
|
||||
- For MC questions, explain why the correct option is right and why the others are wrong
|
||||
- For long questions, show a complete derivation or reasoning chain
|
||||
- Use <ol> or numbered steps in solution when appropriate
|
||||
- Mark common mistakes with <div class="common-error">...</div>
|
||||
- CRITICAL: When a question_text contains "[Context from parent question X]" followed by "[Sub-question Y]", the parent section is background context only. You MUST solve ONLY the specific sub-question labeled [Sub-question Y]. Do NOT solve other sub-questions listed in the parent context. Give one precise answer for that single sub-question only.
|
||||
|
||||
Questions:
|
||||
{questions_payload}
|
||||
"""
|
||||
|
||||
|
||||
# ============================================
|
||||
# 处理管线
|
||||
# ============================================
|
||||
|
||||
# Lower-case substrings of an error message that mark a failure as transient.
RETRYABLE_ERROR_MARKERS = (
    "429",
    "rate limit",
    "rate_limit",
    "too many requests",
    "timeout",
    "timed out",
    "connection",
)


def is_retryable_error(exc: Exception) -> bool:
    """Return True when ``exc`` looks like a transient failure worth retrying.

    An error is retryable when it is a built-in ``TimeoutError`` or
    ``ConnectionError`` (whose message may be empty), when it exposes
    ``status_code == 429``, or when its lower-cased message contains one of
    ``RETRYABLE_ERROR_MARKERS``.
    """
    if isinstance(exc, (TimeoutError, ConnectionError)):
        return True
    if getattr(exc, "status_code", None) == 429:
        return True
    message = str(exc).lower()
    return any(marker in message for marker in RETRYABLE_ERROR_MARKERS)
|
||||
|
||||
|
||||
def pdf_to_images(pdf_bytes: bytes, dpi: int = 96) -> list[str]:
    """Render every page of a PDF to a base64-encoded PNG string.

    Args:
        pdf_bytes: Raw PDF file content.
        dpi: Target resolution; pages are scaled by ``dpi / 72``. The default
            of 96 was chosen by the original author to balance clarity and cost.

    Returns:
        One base64 string per page, in page order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        images = []
        for page in doc:
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
            images.append(base64.b64encode(pix.tobytes("png")).decode())
        return images
    finally:
        # Close the document even when rendering a page raises.
        doc.close()
|
||||
|
||||
|
||||
def parse_json_response(text: str) -> dict:
    """Parse a model reply as JSON, tolerating common formatting slips.

    Handled slips:
      * a Markdown code fence around the JSON, including stray text after the
        closing fence;
      * control characters other than tab, newline and carriage return;
      * invalid escape sequences (a lone backslash before a character that
        JSON does not allow to be escaped), repaired by doubling the backslash.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: When the reply is missing/blank or is not valid JSON after
            cleanup (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if text is None or not text.strip():
        raise ValueError("Model returned an empty response; expected JSON")
    text = text.strip()
    if text.startswith("```"):
        fenced = text.splitlines()[1:]  # drop the opening fence line
        # Cut at the last closing fence; anything after it is commentary.
        for idx in range(len(fenced) - 1, -1, -1):
            if fenced[idx].strip() == "```":
                fenced = fenced[:idx]
                break
        text = "\n".join(fenced)
    # Remove control characters that are illegal in JSON text.
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
    # Repair only a backslash preceded by an even number of backslashes, i.e.
    # one that really starts an (invalid) escape sequence.
    text = re.sub(r'(?<!\\)((?:\\\\)*)\\([^"\\/bfnrtu])', r'\1\\\\\2', text)
    return json.loads(text)
|
||||
|
||||
|
||||
async def gemini_vision_json(
    *,
    system_prompt: str,
    images: list[str],
    user_text: str = "",
    temperature: float = 0,
    max_attempts: int = 6,
) -> dict:
    """Send images plus a prompt to the Gemini vision model and return parsed JSON.

    Args:
        system_prompt: Instructions; a JSON-only reminder is appended.
        images: Base64-encoded PNG images, sent as ``data:image/png`` URLs.
        user_text: Optional text appended after the images.
        temperature: Sampling temperature.
        max_attempts: Total tries. Failures accepted by ``is_retryable_error``
            are retried with a delay that doubles from 2 s up to 30 s.

    Returns:
        The model reply decoded by ``parse_json_response``.

    Raises:
        ValueError: When ``max_attempts`` is below 1.
        Exception: The last error once attempts are exhausted, or any
            non-retryable error immediately.
    """
    if max_attempts < 1:
        # The loop would not run and the function would silently return None.
        raise ValueError("max_attempts must be at least 1")

    client = get_vision_client()
    delay_seconds = 2

    content: list = [
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
        for b64 in images
    ]
    if user_text:
        content.append({"type": "text", "text": user_text})

    messages = [
        {"role": "system", "content": system_prompt + "\n\nIMPORTANT: Your entire response must be valid JSON only. No markdown, no code fences, no extra text."},
        {"role": "user", "content": content},
    ]

    for attempt in range(1, max_attempts + 1):
        try:
            # The SDK call is synchronous; run it in a worker thread so the
            # event loop stays responsive while the request is in flight.
            response = await asyncio.to_thread(
                client.chat.completions.create,
                model="gemini-2.5-flash",
                messages=messages,
                temperature=temperature,
                max_tokens=16384,
            )
            return parse_json_response(response.choices[0].message.content)
        except Exception as exc:
            if attempt == max_attempts or not is_retryable_error(exc):
                raise
            await asyncio.sleep(delay_seconds)
            delay_seconds = min(delay_seconds * 2, 30)
|
||||
|
||||
|
||||
async def deepseek_json_completion(
    *,
    system_prompt: str,
    user_prompt: str | None = None,
    temperature: float = 0,
    max_attempts: int = 6,
) -> dict:
    """Run a text-only DeepSeek JSON completion (used to generate the AI trio).

    Args:
        system_prompt: System message.
        user_prompt: Optional user message.
        temperature: Sampling temperature.
        max_attempts: Total tries. Failures accepted by ``is_retryable_error``
            are retried with a delay that doubles from 2 s up to 30 s.

    Returns:
        The model reply decoded by ``parse_json_response``.

    Raises:
        ValueError: When ``max_attempts`` is below 1.
        Exception: The last error once attempts are exhausted, or any
            non-retryable error immediately.
    """
    if max_attempts < 1:
        # The loop would not run and the function would silently return None.
        raise ValueError("max_attempts must be at least 1")

    client = get_deepseek_client()
    delay_seconds = 2

    messages = [{"role": "system", "content": system_prompt}]
    if user_prompt:
        messages.append({"role": "user", "content": user_prompt})

    for attempt in range(1, max_attempts + 1):
        try:
            # The SDK call is synchronous; run it in a worker thread so the
            # event loop stays responsive while the request is in flight.
            response = await asyncio.to_thread(
                client.chat.completions.create,
                model="deepseek-chat",
                messages=messages,
                temperature=temperature,
                max_tokens=8192,
                response_format={"type": "json_object"},
            )
            # parse_json_response applies the same control-character and
            # invalid-escape cleanup this function used to duplicate inline.
            return parse_json_response(response.choices[0].message.content)
        except Exception as exc:
            if attempt == max_attempts or not is_retryable_error(exc):
                raise
            await asyncio.sleep(delay_seconds)
            delay_seconds = min(delay_seconds * 2, 30)
|
||||
|
||||
|
||||
def chunked(items: list[dict], size: int) -> list[list[dict]]:
    """Split ``items`` into consecutive slices of at most ``size`` elements."""
    batches = []
    for start in range(0, len(items), size):
        batches.append(items[start:start + size])
    return batches
|
||||
|
||||
|
||||
# Roman numerals recognised as sub-sub-question labels.
_ROMAN_NUMERALS = {
    'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5, 'vi': 6, 'vii': 7,
    'viii': 8, 'ix': 9, 'x': 10, 'xi': 11, 'xii': 12, 'xiii': 13,
}


def _question_sort_key(qnum: str) -> tuple:
    """Build a natural sort key for a question number.

    Intended order: 1a < 1b < ... < 1i < 1j < 2a < 2ai < 2aii < 10a.

    The number is split into runs of digits and letters; parentheses and other
    punctuation are ignored, so "1(a)(ii)" and "1aii" produce the same key.
    Each run becomes a ``(category, rank, text)`` triple:

      * digits  -> ``(0, value, '')``
      * a single letter -> ``(1, alphabet position, letter)``, unless it
        directly follows a letter part and is a roman numeral (the "(i)" in
        "1(a)(i)"), in which case it ranks as a roman numeral
      * a roman numeral -> ``(2, value, text)``
      * a compact "letter + roman numeral" run such as "ai" or "aiv" -> a
        letter triple followed by a roman triple
      * anything else -> ``(1, 0, text)``

    A missing (``None``) number yields the empty key and sorts first.
    """
    key = []
    for token in re.findall(r'\d+|[a-zA-Z]+', qnum or ""):
        if token.isdigit():
            key.append((0, int(token), ''))
            continue
        lowered = token.lower()
        follows_letter = bool(key) and key[-1][0] == 1
        if len(lowered) == 1 and not (follows_letter and lowered in _ROMAN_NUMERALS):
            key.append((1, ord(lowered) - ord('a') + 1, token))
        elif lowered in _ROMAN_NUMERALS:
            key.append((2, _ROMAN_NUMERALS[lowered], token))
        elif lowered[1:] in _ROMAN_NUMERALS:
            # Compact form: sub-question letter immediately followed by a
            # roman numeral, e.g. "aii" -> part "a", item "ii".
            key.append((1, ord(lowered[0]) - ord('a') + 1, token[0]))
            key.append((2, _ROMAN_NUMERALS[lowered[1:]], token[1:]))
        else:
            key.append((1, 0, token))
    return tuple(key)


def sort_questions(questions: list[dict]) -> list[dict]:
    """Return ``questions`` sorted naturally by their ``question_number``."""
    return sorted(questions, key=lambda q: _question_sort_key(q.get("question_number", "")))
|
||||
|
||||
|
||||
def extract_code_block(text: str) -> str:
    """Extract Python-looking code lines from question text.

    A line is collected when it starts like code (``import``, ``from x``, an
    assignment, or ``print(``). While brackets opened on collected lines are
    still unclosed, the following lines are collected too, so multi-line
    literals stay intact. Collected lines are stripped of surrounding
    whitespace and joined with newlines.

    This is a heuristic: brackets inside string literals are counted like any
    other, and prose that happens to start like code is collected as well.

    Args:
        text: Question text that may embed code.

    Returns:
        The collected lines joined by newlines, or "" when nothing matched.
    """
    code_start = re.compile(r"^\s*(import |from \w|[A-Za-z_]\w*\s*=|print\()")

    def bracket_delta(fragment: str) -> int:
        """Return opening minus closing bracket count for ``fragment``."""
        opened = fragment.count("(") + fragment.count("[") + fragment.count("{")
        closed = fragment.count(")") + fragment.count("]") + fragment.count("}")
        return opened - closed

    collected = []
    depth = 0  # brackets currently open across the collected lines
    for line in text.splitlines():
        stripped = line.strip()
        if depth > 0 or code_start.match(line):
            collected.append(stripped)
            # Clamp at zero: a stray closing bracket must not make the depth
            # negative and cancel out the next real opening bracket.
            depth = max(0, depth + bracket_delta(stripped))
    return "\n".join(collected)


# Kept for backward compatibility.
extract_code_lines = extract_code_block
|
||||
|
||||
|
||||
def try_exec_python(code: str, shared_ns: dict) -> str | None:
|
||||
"""
|
||||
在 shared_ns 命名空间中执行 code,捕获 stdout。
|
||||
返回输出字符串,失败返回 None。
|
||||
"""
|
||||
buf = io.StringIO()
|
||||
try:
|
||||
with redirect_stdout(buf):
|
||||
exec(code, shared_ns) # noqa: S102
|
||||
output = buf.getvalue().strip()
|
||||
return output if output else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
async def _resume_ai_trio(sb, paper_id: str, questions: list[dict]):
    """Generate the AI trio for questions that have no solution yet.

    The trio (``knowledge_reminder``, ``ai_hint``, ``solution``) is written
    back per question as soon as its batch returns, so an interrupted run can
    be resumed: questions that already have a solution are skipped.

    Args:
        sb: Supabase client used for all reads and writes.
        paper_id: ID of the paper whose questions are processed.
        questions: Rows carrying at least ``id``, ``question_number`` and
            ``solution``.

    The paper is marked ``ready`` at the end even if some batches failed;
    a failed batch is logged and the remaining batches still run.
    """
    import logging

    log = logging.getLogger(__name__)
    batch_size = 3  # questions sent to the model per request

    need = [q for q in questions if not q.get("solution")]
    if not need:
        # Every question already has a solution: just mark the paper done.
        sb.table("papers").update({"status": "ready", "processing_step": None}).eq("id", paper_id).execute()
        return

    total_q = len(questions)
    done_q = total_q - len(need)

    # Keys are normalised to stripped strings so that a model echoing the
    # number as an int, or with stray whitespace, still maps to its row.
    id_map = {str(q["question_number"]).strip(): q["id"] for q in need}

    # The full question text is needed to build the model payload.
    full_data = sb.table("paper_questions").select(
        "id, question_number, question_type, question_text, score, correct_option, correct_answer, raw_answer_text"
    ).eq("paper_id", paper_id).in_("id", [q["id"] for q in need]).execute().data

    payloads = []
    for q in full_data:
        answer_section = q.get("raw_answer_text") or ""
        if not answer_section and q.get("correct_option"):
            answer_section = f"Correct option: {q['correct_option']}"
        elif not answer_section and q.get("correct_answer"):
            answer_section = f"Correct answer: {q['correct_answer']}"
        payloads.append({
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q["question_text"] or "",
            "reference_answer": answer_section,
        })

    batches = chunked(payloads, batch_size)
    for batch_idx, batch in enumerate(batches, 1):
        current = done_q + batch_idx * batch_size
        _update_progress(sb, paper_id, f"Generating solutions ({min(current, total_q)}/{total_q} questions)", batch_idx, len(batches))
        try:
            result = await deepseek_json_completion(
                system_prompt=BATCH_ANALYSIS_PROMPT.format(
                    questions_payload=json.dumps(batch, ensure_ascii=False),
                ),
                temperature=0.3,
            )
            for item in result.get("analyses", []):
                qnum = item.get("question_number")
                qid = id_map.get(str(qnum).strip()) if qnum is not None else None
                if qid:
                    sb.table("paper_questions").update({
                        "knowledge_reminder": item.get("knowledge_reminder", ""),
                        "ai_hint": item.get("ai_hint", ""),
                        "solution": item.get("solution", ""),
                    }).eq("id", qid).execute()
        except Exception:
            # Best effort: one failed batch must not stop the others, but the
            # failure is recorded instead of disappearing silently.
            log.exception("AI trio batch %d/%d failed for paper %s", batch_idx, len(batches), paper_id)
        await asyncio.sleep(1)

    # Mark the paper as done.
    sb.table("papers").update({"status": "ready", "processing_step": None}).eq("id", paper_id).execute()
|
||||
|
||||
|
||||
def _update_progress(sb, paper_id: str, step: str, progress: int = 0, total: int = 0):
    """Write the current processing step and its counters to the paper row."""
    fields = {
        "processing_step": step,
        "processing_progress": progress,
        "processing_total": total,
    }
    papers = sb.table("papers")
    papers.update(fields).eq("id", paper_id).execute()
|
||||
|
||||
|
||||
async def process_paper(paper_id: str, paper_bytes: bytes, answer_bytes: bytes | None):
    """Background pipeline: PDF pages -> Vision structuring -> AI trio.

    Each stage is persisted to the DB as soon as it completes so that an
    interrupted run can resume. If the paper already has question rows,
    extraction is skipped and only the missing AI trios are generated.

    Args:
        paper_id: ID of the ``papers`` row being processed.
        paper_bytes: Raw bytes of the exam paper PDF.
        answer_bytes: Raw bytes of the answer PDF, or ``None`` if absent.

    Raises:
        Exception: Any failure outside the best-effort answer matching is
            recorded on the paper row (``status="error"``) and re-raised.
    """
    import logging

    log = logging.getLogger(__name__)
    sb = get_supabase()

    try:
        # Resume case: questions already exist, so only fill in the AI trio.
        existing = sb.table("paper_questions").select("id, question_number, solution").eq("paper_id", paper_id).execute().data
        if existing:
            await _resume_ai_trio(sb, paper_id, existing)
            return

        # -- Step 1: PDF -> images --
        _update_progress(sb, paper_id, "Rendering PDF pages...")
        paper_images = pdf_to_images(paper_bytes)

        # -- Step 2: Vision-based question extraction, PAGE_BATCH pages at a time --
        PAGE_BATCH = 8
        all_questions: list = []
        meta: dict = {}
        num_page_batches = -(-len(paper_images) // PAGE_BATCH)  # ceiling division
        for i in range(0, len(paper_images), PAGE_BATCH):
            batch_imgs = paper_images[i:i + PAGE_BATCH]
            batch_idx = i // PAGE_BATCH + 1
            _update_progress(sb, paper_id, f"Reading pages {i+1}-{i+len(batch_imgs)}...", batch_idx, num_page_batches)
            batch_result = await gemini_vision_json(
                system_prompt=STRUCTURE_PROMPT,
                images=batch_imgs,
                user_text=f"Pages {i+1}-{i+len(batch_imgs)} of the exam paper. Extract all questions visible on these pages.",
                temperature=0,
            )
            if not meta:
                # Paper-level metadata is taken from the first batch only.
                meta = {k: batch_result.get(k) for k in ("total_score", "difficulty_level", "topics_summary")}
            all_questions.extend(batch_result.get("questions", []))

        questions = sort_questions(all_questions)

        # Update the paper overview.
        sb.table("papers").update({
            "total_score": meta.get("total_score"),
            "question_count": len(questions),
            "topics_summary": meta.get("topics_summary"),
            "difficulty_level": meta.get("difficulty_level"),
        }).eq("id", paper_id).execute()

        # -- Step 3: answer matching (batched; failures are skipped) --
        answers_map = {}
        if answer_bytes:
            _update_progress(sb, paper_id, "Matching answers...")
            try:
                answer_images = pdf_to_images(answer_bytes)
                questions_json = json.dumps(
                    [{"question_number": q["question_number"], "question_type": q["question_type"]}
                     for q in questions], ensure_ascii=False,
                )
                all_answers: list = []
                for ai in range(0, len(answer_images), PAGE_BATCH):
                    batch_ans_imgs = answer_images[ai:ai + PAGE_BATCH]
                    try:
                        match_result = await gemini_vision_json(
                            system_prompt=ANSWER_MATCH_PROMPT.format(
                                questions_json=questions_json, answer_text="(See images)",
                            ),
                            images=batch_ans_imgs,
                            user_text=f"Match answers to these questions: {questions_json}",
                            temperature=0,
                        )
                        all_answers.extend(match_result.get("answers", []))
                    except Exception:
                        log.warning("Answer matching failed for pages starting at %d of paper %s", ai + 1, paper_id, exc_info=True)
                # Ignore malformed items instead of letting a single missing
                # question_number discard every matched answer.
                answers_map = {
                    a["question_number"]: a
                    for a in all_answers
                    if isinstance(a, dict) and a.get("question_number") is not None
                }
            except Exception:
                log.warning("Answer matching skipped for paper %s", paper_id, exc_info=True)

        # -- Step 4: persist questions immediately (AI trio not included yet) --
        _update_progress(sb, paper_id, "Saving questions...")
        for i, q in enumerate(questions):
            qnum = q["question_number"]
            answer = answers_map.get(qnum, {})
            # The model may return "topics": [] or null; indexing [0] blindly
            # would raise and abort the run after a partial insert.
            topic_list = q.get("topics") or []
            first_topic = topic_list[0] if topic_list else None
            sb.table("paper_questions").insert(strip_nulls({
                "paper_id": paper_id,
                "question_number": qnum,
                "parent_question": q.get("parent_question"),
                "display_order": i,
                "question_type": q["question_type"],
                "question_text": q["question_text"],
                "score": q.get("score"),
                "page_number": q.get("page_number"),
                "options": q.get("options"),
                "correct_option": answer.get("correct_option"),
                "correct_answer": answer.get("correct_answer"),
                "raw_answer_text": answer.get("raw_answer_text"),
                "topics": q.get("topics", []),
                "analytics_topic": first_topic,
                "topic_tags": q.get("topics", []),
                "difficulty": q.get("difficulty"),
            })).execute()

        # -- Step 5: AI trio (written per question, resumable) --
        saved = sb.table("paper_questions").select("id, question_number, solution").eq("paper_id", paper_id).execute().data
        await _resume_ai_trio(sb, paper_id, saved)

    except Exception as e:
        sb.table("papers").update({
            "status": "error",
            "error_message": f"{type(e).__name__}: {str(e)}\n{traceback.format_exc()[-500:]}",
        }).eq("id", paper_id).execute()
        raise
|
||||
13
backend/app/services/supabase_client.py
Normal file
13
backend/app/services/supabase_client.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from supabase import create_client, Client
|
||||
from app.config import get_settings
|
||||
|
||||
_client: Client | None = None
|
||||
|
||||
|
||||
def get_supabase() -> Client:
    """Return the shared Supabase client, creating it on first use.

    The client is built from the configured URL and service-role key (per the
    original note, this key bypasses RLS) and cached in module-level
    ``_client`` for later calls.
    """
    global _client
    if _client is not None:
        return _client
    settings = get_settings()
    _client = create_client(settings.supabase_url, settings.supabase_service_role_key)
    return _client
|
||||
48
backend/app/services/text_extractor.py
Normal file
48
backend/app/services/text_extractor.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""PDF text extraction, reusing the text_extractor logic from SOS."""
|
||||
|
||||
import base64
|
||||
import fitz # PyMuPDF
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ExtractedContent:
    """Text and fallback page renders extracted from one PDF."""

    pages_text: list[str]  # text of each page, in page order
    page_images: dict[int, str]  # zero-based page index -> base64 PNG render (pages with little text)
    total_pages: int  # number of pages in the document
    has_images: bool  # True when at least one page render was captured
|
||||
|
||||
|
||||
def extract_pdf(file_bytes: bytes) -> ExtractedContent:
    """Extract per-page text, plus page renders for text-poor pages, from a PDF.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        An ``ExtractedContent`` with the text of every page. Pages whose
        stripped text is shorter than 50 characters are also rendered at
        200 dpi and stored as base64 PNG under their zero-based page index.
    """
    pages_text: list[str] = []
    page_images: dict[int, str] = {}

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        for i, page in enumerate(doc):
            text = page.get_text("text")
            pages_text.append(text)

            # Very little text suggests a scanned page: keep a render so it
            # can be passed to Vision OCR. NOTE(review): the page is not
            # checked for actually containing images.
            if len(text.strip()) < 50:
                pix = page.get_pixmap(dpi=200)
                img_bytes = pix.tobytes("png")
                page_images[i] = base64.b64encode(img_bytes).decode("utf-8")
    finally:
        # Release the document even when a page fails to extract or render.
        doc.close()

    return ExtractedContent(
        pages_text=pages_text,
        page_images=page_images,
        total_pages=len(pages_text),
        has_images=len(page_images) > 0,
    )
|
||||
|
||||
|
||||
def get_full_text(extracted: ExtractedContent) -> str:
    """Join the text of all non-blank pages, each under a ``--- Page N ---`` header.

    Page numbers in the headers are one-based; pages whose text is empty or
    whitespace-only are left out. Sections are separated by a blank line.
    """
    sections = []
    for index, page_text in enumerate(extracted.pages_text):
        if not page_text.strip():
            continue
        sections.append(f"--- Page {index+1} ---\n{page_text}")
    return "\n\n".join(sections)
|
||||
252
backend/backfill_ai_trio_with_context.py
Normal file
252
backend/backfill_ai_trio_with_context.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""
|
||||
Regenerate the AI trio for every question; sub-questions are sent with their parent question's context.
|
||||
Usage: python backfill_ai_trio_with_context.py [--paper-id <id>] [--course <code>] [--missing-only]
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from contextlib import redirect_stdout
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.llm_clients import get_deepseek_client
|
||||
|
||||
|
||||
def extract_code_lines(text: str) -> str:
    """Extract the Python-looking lines from a question's text.

    A line starts a code statement when, after optional leading whitespace,
    it begins with ``import ``, ``from <word>``, ``<identifier> =`` or
    ``print(``. While such a statement leaves a bracket open, the following
    lines are collected as its continuation. Every other line is dropped.

    Args:
        text: Raw question text. ``None`` and ``""`` both yield ``""``.

    Returns:
        The collected lines, each stripped of surrounding whitespace and
        joined with newlines.
    """
    code_start = re.compile(r"^\s*(import |from \w|[A-Za-z_]\w*\s*=|print\()")
    kept: list[str] = []
    depth = 0  # brackets opened by collected lines and not yet closed

    for line in (text or "").splitlines():
        stripped = line.strip()
        if depth > 0 or code_start.match(line):
            kept.append(stripped)
            opened = sum(stripped.count(ch) for ch in "([{")
            closed = sum(stripped.count(ch) for ch in ")]}")
            # Clamp at zero so a stray closing bracket cannot push the
            # counter negative and mask a later open bracket.
            depth = max(0, depth + opened - closed)

    return "\n".join(kept)
|
||||
|
||||
|
||||
def try_exec_python(code: str, shared_ns: dict) -> str | None:
|
||||
buf = io.StringIO()
|
||||
try:
|
||||
with redirect_stdout(buf):
|
||||
exec(code, shared_ns) # noqa: S102
|
||||
output = buf.getvalue().strip()
|
||||
return output if output else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
BATCH_ANALYSIS_PROMPT = """You are an expert academic answer analyst. Generate three study sections for each question below. ALL output must be in English.
|
||||
|
||||
For every question, return:
|
||||
- knowledge_reminder: concise prerequisite bullets in HTML
|
||||
- ai_hint: a helpful hint in HTML without revealing the final answer
|
||||
- solution: a complete step-by-step solution in HTML
|
||||
|
||||
Return JSON in this exact format:
|
||||
{{
|
||||
"analyses": [
|
||||
{{
|
||||
"question_number": "1a",
|
||||
"knowledge_reminder": "<HTML>...</HTML>",
|
||||
"ai_hint": "<HTML>...</HTML>",
|
||||
"solution": "<HTML>...</HTML>"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Rules:
|
||||
- Return one item for every provided question_number
|
||||
- All text must be in English
|
||||
- HTML only, KaTeX compatible (block $$ ... $$ inline $ ... $)
|
||||
- For MC questions, explain why the correct option is right and why others are wrong
|
||||
- For long questions, show a complete derivation or reasoning chain
|
||||
- Use <ol> or numbered steps in solution when appropriate
|
||||
- Mark common mistakes with <div class="common-error">...</div>
|
||||
- CRITICAL: When a question_text contains "[Context from parent question X]" followed by "[Sub-question Y]", the parent section is background context only. You MUST solve ONLY the specific sub-question labeled [Sub-question Y]. Do NOT solve other sub-questions listed in the parent context. Give one precise answer for that single sub-question only.
|
||||
|
||||
Questions:
|
||||
{questions_payload}
|
||||
"""
|
||||
|
||||
|
||||
def chunked(lst, size):
    """Split ``lst`` into consecutive slices of at most ``size`` items.

    Returns a list of slices; the last one may be shorter. An empty input
    yields an empty list.
    """
    pieces = []
    for start in range(0, len(lst), size):
        stop = start + size
        pieces.append(lst[start:stop])
    return pieces
|
||||
|
||||
|
||||
async def deepseek_batch(batch: list[dict]) -> list[dict]:
    """Request AI-trio analyses for one batch of questions from DeepSeek.

    Up to five attempts are made. After a failed attempt (except the last)
    the coroutine sleeps 2, 4, 8 and 16 seconds respectively before retrying.

    Args:
        batch: Question payloads, serialised as JSON into the system prompt.

    Returns:
        The ``analyses`` list of the parsed JSON reply, or ``[]`` when the
        key is absent or every attempt failed.
    """
    client = get_deepseek_client()
    attempt = 0
    while attempt < 5:
        try:
            system_message = {
                "role": "system",
                "content": BATCH_ANALYSIS_PROMPT.format(
                    questions_payload=json.dumps(batch, ensure_ascii=False)
                ),
            }
            resp = client.chat.completions.create(
                model="deepseek-chat",
                messages=[system_message],
                temperature=0.3,
                max_tokens=8192,
                response_format={"type": "json_object"},
            )
            content = resp.choices[0].message.content
            # Drop control characters that are not valid inside JSON strings.
            cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', content)
            # Double any backslash that does not start a valid JSON escape
            # (typically LaTeX such as \alpha) so json.loads accepts it.
            cleaned = re.sub(r'(?<!\\)((?:\\\\)*)\\([^"\\/bfnrtu])', r'\1\\\\\2', cleaned)
            parsed = json.loads(cleaned)
            return parsed.get("analyses", [])
        except Exception as e:
            print(f" attempt {attempt+1} failed: {e}")
            if attempt < 4:
                await asyncio.sleep(2 ** attempt * 2)
        attempt += 1
    return []
|
||||
|
||||
|
||||
async def main():
    """Regenerate the AI trio for stored questions, with parent context.

    Command-line options:
        --paper-id      only process this paper
        --course        only process papers with this course code
        --missing-only  only process questions that have no solution yet

    For a sub-question, the parent question's text is prepended as context.
    When a question has no reference answer, code lines found in its text
    are executed (after the parent's code, in a namespace shared by the
    sub-questions of the same parent) and the printed output is used as the
    reference answer.
    """
    from collections import defaultdict

    try:
        import numpy as np
    except ImportError:
        np = None  # executed snippets simply will not have ``np`` predefined

    parser = argparse.ArgumentParser()
    parser.add_argument("--paper-id", help="Only process this paper")
    parser.add_argument("--course", help="Only process papers with this course code")
    parser.add_argument("--missing-only", action="store_true", help="Only process questions missing solution")
    args = parser.parse_args()

    sb = get_supabase()

    # Fetch all questions (with paper info for filtering).
    # NOTE(review): no pagination is used here; confirm the API's row limit
    # cannot truncate this result.
    query = sb.table("paper_questions").select(
        "id, paper_id, question_number, question_type, question_text, "
        "parent_question, score, correct_option, correct_answer, raw_answer_text, "
        "analytics_topic, topic_tags, solution"
    )
    if args.paper_id:
        query = query.eq("paper_id", args.paper_id)
    all_questions = query.order("paper_id").order("display_order").execute().data

    if args.course:
        # Filter by course via the papers table.
        papers_res = sb.table("papers").select("id").eq("course_code", args.course.upper()).execute()
        paper_ids = {p["id"] for p in papers_res.data}
        all_questions = [q for q in all_questions if q["paper_id"] in paper_ids]

    # Parent texts are taken from the unfiltered question set: with
    # --missing-only a parent that already has a solution must still be
    # available as context for its unsolved sub-questions.
    parent_texts: dict[str, dict[str, str]] = defaultdict(dict)
    for q in all_questions:
        parent_texts[q["paper_id"]][q["question_number"]] = q["question_text"] or ""

    if args.missing_only:
        targets = [q for q in all_questions if not q.get("solution")]
        print(f"Questions missing solution: {len(targets)}")
    else:
        targets = all_questions
        print(f"Total questions to process: {len(targets)}")

    # Group the questions to process by paper_id.
    by_paper: dict[str, list] = defaultdict(list)
    for q in targets:
        by_paper[q["paper_id"]].append(q)

    total_updated = 0

    for paper_id, questions in by_paper.items():
        print(f"\nPaper {paper_id} — {len(questions)} questions")

        # Any question of the paper may be the parent of another one.
        parent_text_map = parent_texts[paper_id]

        # Build payloads with parent context and executed output.
        payloads = []
        exec_namespaces: dict[str, dict] = {}

        for q in questions:
            parent_q = q.get("parent_question")
            has_parent = bool(parent_q) and parent_q in parent_text_map
            if has_parent:
                full_text = (
                    f"[Context from parent question {parent_q}]\n"
                    f"{parent_text_map[parent_q]}\n\n"
                    f"[Sub-question {q['question_number']}]\n"
                    f"{q['question_text'] or ''}"
                )
            else:
                full_text = q["question_text"] or ""

            answer_section = ""
            if q.get("raw_answer_text"):
                answer_section = q["raw_answer_text"]
            elif q.get("correct_option"):
                answer_section = f"Correct option: {q['correct_option']}"
            elif q.get("correct_answer"):
                answer_section = f"Correct answer: {q['correct_answer']}"

            # No reference answer: try executing the code to get real output.
            if not answer_section:
                group_key = parent_q or q["question_number"]
                if group_key not in exec_namespaces:
                    ns: dict = {}
                    if np is not None:
                        ns["np"] = np
                    # Run the parent's setup code first.
                    if has_parent:
                        setup = extract_code_lines(parent_text_map[parent_q])
                        try_exec_python(setup, ns)
                    exec_namespaces[group_key] = ns

                ns = exec_namespaces[group_key]
                sub_code = extract_code_lines(q["question_text"] or "")
                if sub_code:
                    exec_out = try_exec_python(sub_code, ns)
                    if exec_out is not None:
                        answer_section = f"Executed output: {exec_out}"
                        print(f" [exec] {q['question_number']}: {exec_out[:60]}")

            payloads.append({
                "question_number": q["question_number"],
                "question_type": q["question_type"] or "long_question",
                "score": q.get("score") or "unknown",
                "question_text": full_text,
                "reference_answer": answer_section,
            })

        # Keys are normalised to stripped strings so a model echoing the
        # number as an int, or with stray whitespace, still maps to its row.
        id_map = {str(q["question_number"]).strip(): q["id"] for q in questions}

        # Process in batches of 3.
        for batch in chunked(payloads, 3):
            nums = [p["question_number"] for p in batch]
            print(f" Batch {nums} ...", end=" ", flush=True)

            analyses = await deepseek_batch(batch)

            updated = 0
            for item in analyses:
                qnum = item.get("question_number")
                qid = id_map.get(str(qnum).strip()) if qnum is not None else None
                if not qid:
                    continue
                sb.table("paper_questions").update({
                    "knowledge_reminder": item.get("knowledge_reminder"),
                    "ai_hint": item.get("ai_hint"),
                    "solution": item.get("solution"),
                }).eq("id", qid).execute()
                updated += 1

            total_updated += updated
            # Report rows actually written, not merely analyses returned.
            print(f"done ({updated} updated)")
            await asyncio.sleep(1)

    print(f"\nDone. Total updated: {total_updated}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
160
backend/backfill_comp2211_page_y.py
Normal file
160
backend/backfill_comp2211_page_y.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Backfill page_y_ratio for COMP2211 subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
import fitz
|
||||
import httpx
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
PAPERS_DIR = ROOT / "pastpaper-scraper" / "papers" / "COMP2211"
|
||||
|
||||
PDF_BY_EXAM_KEY = {
|
||||
"COMP2211-2022-fall-midterm": "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
|
||||
"COMP2211-2022-spring-midterm": "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
|
||||
"COMP2211-2022-spring-final-part-a": "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
|
||||
"COMP2211-2022-spring-final-part-b": "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
|
||||
"COMP2211-2023-spring-midterm": "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
|
||||
"COMP2211-2024-spring-midterm": "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
|
||||
"COMP2211-2024-spring-final": "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
|
||||
}
|
||||
|
||||
|
||||
def marker_candidates(question_number: str) -> list[str]:
    """Return the text markers to look for on a page, most specific first.

    Examples:
        ``"1a_i"``    -> ``["(i)", "(a)"]``
        ``"12ab_ii"`` -> ``["(ii)", "(ab)"]``
        ``"3b"``      -> ``["(b)", "Problem 3"]``
        ``"7"``       -> ``["Problem 7"]``
        anything else -> the question number itself

    Args:
        question_number: Stored question identifier; an underscore separates
            a sub-part from the part it belongs to.
    """
    if "_" in question_number:
        left, right = question_number.split("_", 1)
        tokens = [f"({right})"]
        # Strip the leading digits from the left part ("12ab" -> "ab"). This
        # is done with a plain regex group: the earlier inline pattern
        # r'^\\d+' matched a literal backslash and never removed the digits.
        part = re.fullmatch(r"\d+([a-z]+)", left)
        if part:
            tokens.append(f"({part.group(1)})")
        return tokens

    m = re.fullmatch(r"(\d+)([a-z])", question_number)
    if m:
        return [f"({m.group(2)})", f"Problem {m.group(1)}"]

    if question_number.isdigit():
        return [f"Problem {question_number}"]

    return [question_number]
|
||||
|
||||
|
||||
def line_matches(line_text: str, marker: str) -> bool:
    """Tell whether a line of page text contains the given marker.

    Whitespace runs in the line are collapsed to single spaces first.
    Parenthesised markers such as ``"(a)"`` must start the line. Any other
    marker may appear anywhere, case-insensitively.

    Args:
        line_text: Text of one line taken from the page.
        marker: Marker produced for a question number.

    Returns:
        ``True`` if the line matches, ``False`` otherwise (always ``False``
        for a blank line).
    """
    text = re.sub(r"\s+", " ", line_text.strip())
    if not text:
        return False
    if marker.startswith("("):
        return text.startswith(marker)

    pattern = re.escape(marker)
    if marker[-1:].isdigit():
        # A marker ending in a number must not match a longer number:
        # "Problem 1" should not hit a "Problem 12" heading.
        pattern += r"(?!\d)"
    return re.search(pattern, text, re.IGNORECASE) is not None
|
||||
|
||||
|
||||
def line_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Locate the topmost text line matching ``marker`` on the page.

    The page's ``dict`` text layout is scanned, considering only blocks with
    ``type == 0``, and the span texts of each line are concatenated before
    matching.

    Returns:
        The top edge of the highest matching line as a fraction of the page
        height, clamped to ``[0.0, 0.98]``, or ``None`` if no line matches.
    """
    layout = page.get_text("dict")
    best_top = None
    for block in layout.get("blocks", []):
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            joined = "".join(span.get("text", "") for span in line.get("spans", []))
            if not line_matches(joined, marker):
                continue
            bbox = line.get("bbox")
            if not bbox:
                continue
            top = float(bbox[1])
            if best_top is None or top < best_top:
                best_top = top

    if best_top is None:
        return None
    relative = (best_top - page.rect.y0) / page.rect.height
    return max(0.0, min(relative, 0.98))
|
||||
|
||||
|
||||
def search_y_ratio(page: fitz.Page, marker: str) -> float | None:
    """Find ``marker`` with the page's text search and return its highest hit.

    Returns:
        The smallest top-edge position among all hits, as a fraction of the
        page height clamped to ``[0.0, 0.98]``, or ``None`` without hits.
    """
    best = None
    for rect in page.search_for(marker):
        ratio = max(0.0, min((rect.y0 - page.rect.y0) / page.rect.height, 0.98))
        if best is None or ratio < best:
            best = ratio
    return best
|
||||
|
||||
|
||||
def infer_y_ratio(page: fitz.Page, question_number: str) -> float:
    """Estimate where a question starts on its page, as a height fraction.

    Each marker candidate is tried in order, first by line matching and then
    by text search; the first position found is returned. If nothing is
    found, ``0.05`` is returned as a fallback.
    """
    for marker in marker_candidates(question_number):
        for locate in (line_y_ratio, search_y_ratio):
            ratio = locate(page, marker)
            if ratio is not None:
                return ratio
    return 0.05
|
||||
|
||||
|
||||
def main() -> None:
    """Compute and store ``page_y_ratio`` for COMP2211 course-library questions.

    For every matching paper with a known local PDF, each question's vertical
    position on its page is inferred. The collected values are then written
    to ``paper_questions`` by a small thread pool, retrying HTTP errors up to
    five times per row.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id, source_exam_key")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
        or []
    )

    updates: list[tuple[str, float]] = []
    for paper in papers:
        exam_key = paper["source_exam_key"]
        pdf_name = PDF_BY_EXAM_KEY.get(exam_key)
        if not pdf_name:
            continue
        pdf_path = PAPERS_DIR / pdf_name
        doc = fitz.open(pdf_path)
        try:
            questions = (
                sb.table("paper_questions")
                .select("id, question_number, page_number")
                .eq("paper_id", paper["id"])
                .order("display_order")
                .execute()
                .data
                or []
            )
            page_count = len(doc)
            for question in questions:
                page_number = question.get("page_number") or 1
                # Guard the index: a page number past the end would abort the
                # whole run, and a negative one would silently wrap around to
                # a page counted from the end.
                if not 1 <= page_number <= page_count:
                    print(
                        f"Skipping question {question['question_number']} of {exam_key}: "
                        f"page {page_number} is outside 1-{page_count}"
                    )
                    continue
                page = doc[page_number - 1]
                ratio = infer_y_ratio(page, question["question_number"])
                updates.append((question["id"], round(ratio, 4)))
        finally:
            doc.close()

    def apply_update(payload: tuple[str, float]) -> None:
        """Write one ratio, retrying HTTP errors with a growing delay."""
        question_id, ratio = payload
        attempts = 0
        while True:
            try:
                sb.table("paper_questions").update({"page_y_ratio": ratio}).eq("id", question_id).execute()
                return
            except httpx.HTTPError:
                attempts += 1
                if attempts >= 5:
                    raise
                time.sleep(0.4 * attempts)

    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(apply_update, payload) for payload in updates]
        for future in as_completed(futures):
            # result() re-raises any exception from the worker.
            future.result()

    print(f"Backfilled page_y_ratio for {len(updates)} COMP2211 questions.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
365
backend/backfill_comp2211_tags.py
Normal file
365
backend/backfill_comp2211_tags.py
Normal file
@@ -0,0 +1,365 @@
|
||||
"""Backfill COMP2211 tags to the revised retrieval schema."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
SKILL_LABELS = {
|
||||
"concept_check": "Concept Check",
|
||||
"code_tracing": "Code Tracing",
|
||||
"algorithm_tracing": "Algorithm Tracing",
|
||||
"distance_calculation": "Distance Calculation",
|
||||
"centroid_update": "Centroid Update",
|
||||
"weight_update": "Weight Update",
|
||||
"decision_boundary": "Decision Boundary",
|
||||
"implementation": "Implementation",
|
||||
"debugging": "Debugging",
|
||||
"model_selection": "Model Selection",
|
||||
"concept_explanation": "Concept Explanation",
|
||||
"architecture_reasoning": "Architecture Reasoning",
|
||||
"convergence_reasoning": "Convergence Reasoning",
|
||||
"generalization_reasoning": "Generalization Reasoning",
|
||||
"classification_decision": "Classification Decision",
|
||||
}
|
||||
|
||||
ACRONYMS = {
|
||||
"ai": "AI",
|
||||
"cnn": "CNN",
|
||||
"knn": "KNN",
|
||||
"mlp": "MLP",
|
||||
"nb": "NB",
|
||||
"numpy": "NumPy",
|
||||
}
|
||||
|
||||
|
||||
def title_case_with_acronyms(value: str) -> str:
    """Title-case a tag, using the spellings in ``ACRONYMS`` where defined.

    The value is split on runs of whitespace and underscores. Each word is
    lower-cased, then replaced by its ``ACRONYMS`` entry if there is one and
    capitalised otherwise. The words are joined with single spaces.
    """
    lowered_words = (word.lower() for word in re.split(r"[\s_]+", value.strip()) if word)
    return " ".join(ACRONYMS.get(word, word.capitalize()) for word in lowered_words)
|
||||
|
||||
|
||||
def normalize_skill_tag(tag: str) -> str:
    """Map a skill tag to its display label.

    Tags listed in ``SKILL_LABELS`` use that label; any other tag is
    formatted with ``title_case_with_acronyms``.
    """
    try:
        return SKILL_LABELS[tag]
    except KeyError:
        return title_case_with_acronyms(tag)
|
||||
|
||||
|
||||
def text_blob(question: dict) -> str:
    """Build one lower-cased search string from a question's text fields.

    The question text, raw answer text, topic tags, skill tags and analytics
    topic are joined with single spaces, in that order. Missing or empty
    fields contribute an empty string, so consecutive spaces may appear.
    """

    def _text(key: str) -> str:
        return question.get(key) or ""

    def _tags(key: str) -> str:
        return " ".join(question.get(key) or [])

    pieces = (
        _text("question_text"),
        _text("raw_answer_text"),
        _tags("topic_tags"),
        _tags("skill_tags"),
        _text("analytics_topic"),
    )
    return " ".join(pieces).lower()
|
||||
|
||||
|
||||
def has_any(text: str, phrases: list[str]) -> bool:
    """Return True if at least one of ``phrases`` occurs in ``text``."""
    for phrase in phrases:
        if phrase in text:
            return True
    return False
|
||||
|
||||
|
||||
def infer_analytics_topic(question: dict) -> str:
    """Pick the fine-grained analytics topic for a question.

    The decision is an ordered cascade, and the first matching rule wins:

    1. Keyword rules evaluated against the lower-cased text blob (question
       text, answer text, existing tags and the current analytics topic).
    2. Disambiguation of the combined "broad" topics (for example
       "KNN and Clustering") using keywords and normalised skill tags.
    3. A direct mapping of the remaining broad topics, defaulting to
       "Python and NumPy" when nothing else applies.

    Args:
        question: Row dict; reads ``question_text``, ``raw_answer_text``,
            ``topic_tags``, ``skill_tags`` and ``analytics_topic``.

    Returns:
        One of the topic names used as keys of TOPIC_CONCEPTS.
    """
    broad = question.get("analytics_topic") or ""
    if broad == "Perceptron and MLP":
        # The combined label itself contains the keyword "mlp". Left in the
        # blob, it makes the MLP keyword rule below fire for every such
        # question, so the dedicated Perceptron/MLP split further down could
        # never return "Perceptron". Match on the text without the label.
        text = text_blob({**question, "analytics_topic": None})
    else:
        text = text_blob(question)
    skills = {normalize_skill_tag(tag) for tag in (question.get("skill_tags") or [])}

    # NOTE(review): matching is by plain substring, so short keywords such as
    # "bias", "kernel", "mask" or "fold " can also hit unrelated wording (for
    # example a perceptron "bias term") - confirm against real data.
    if has_any(text, ["ethics", "bias", "privacy", "autonomous vehicle", "informed consent", "human participants", "ethically"]):
        return "Ethics of AI"
    if has_any(text, ["minimax", "alpha-beta", "alpha beta", "game tree", "tic-tac-toe", "tic tac toe"]):
        return "Game Trees"
    if has_any(text, ["search algorithm", "best-first", "breadth-first", "depth-first", "a* search", "a star"]):
        return "Search Algorithms"
    if has_any(text, ["cross validation", "d-fold", "k-fold", "train/val", "validation set", "fold "]) or broad == "Cross Validation":
        return "Cross Validation"
    if has_any(text, ["confusion matrix", "precision", "recall", "macro f1", "f1 score", "accuracy score", "evaluation metric"]):
        return "Evaluation Metrics"
    if has_any(text, ["naive bayes", "gaussian distribution", "laplace smoothing", "likelihood", "posterior probability"]) or broad == "Naive Bayes":
        return "Naive Bayes"
    if has_any(text, ["bayes classifier", "conditional probability", "bayesian inference", "prior probability", "posterior"]) or broad == "Bayesian Inference":
        return "Bayesian Inference"
    if has_any(text, ["leader clustering", "k-means", "k means", "centroid", "elbow method", "silhouette", "cluster assignments", "closest centroid", "new cluster"]):
        return "K-Means"
    if has_any(text, ["k-nearest", "nearest neighbors", "weighted knn", "cosine distance", "euclidean distance", "manhattan distance", "6-cross-validation error for k", "class for cosine distance"]):
        return "KNN"
    if has_any(text, ["multilayer perceptron", "mlp", "back propagation", "backpropagation", "hidden layer", "output layer", "dropout", "softmax", "sigmoid function", "relu as the activation"]) or broad == "MLP":
        return "MLP"
    if has_any(text, ["perceptron", "decision boundary", "single neuron", "weight update", "activation function f(z)", "linearly separable"]) or broad == "Perceptron":
        return "Perceptron"
    if has_any(text, ["convolutional neural network", "cnn", "kernel", "padding", "stride", "pooling", "dilated convolution", "3d convolution", "otsu", "histogram", "image processing", "grayscale image"]):
        return "CNN"
    if has_any(text, ["numpy", "python", "np.", "broadcasting", "reshape", "transpose", "mask", "vectorized", "np.arange", "np.mean", "np.dot", "np.convolve"]):
        return "Python and NumPy"

    # No keyword rule matched: split the combined broad topics.
    if broad == "KNN and Clustering":
        if (
            has_any(text, ["k-means", "k means", "centroid", "leader clustering", "elbow", "silhouette"])
            or "Centroid Update" in skills
            or "Convergence Reasoning" in skills
            or "Algorithm Tracing" in skills
            or "Model Selection" in skills
        ):
            return "K-Means"
        return "KNN"

    if broad == "Perceptron and MLP":
        if (
            has_any(text, ["hidden layer", "backprop", "activation function", "softmax", "relu", "sigmoid", "multilayer perceptron", "mlp"])
            or "Architecture Reasoning" in skills
        ):
            return "MLP"
        return "Perceptron"

    if broad == "Probabilistic Models":
        if has_any(text, ["naive bayes", "gaussian", "laplace", "likelihood"]):
            return "Naive Bayes"
        return "Bayesian Inference"

    if broad == "Evaluation and Validation":
        if has_any(text, ["cross validation", "cross-validation", "k-fold", "d-fold", "validation set", "train/val"]):
            return "Cross Validation"
        return "Evaluation Metrics"

    if broad == "Search and Games":
        if has_any(text, ["minimax", "alpha-beta", "alpha beta", "game tree"]):
            return "Game Trees"
        return "Search Algorithms"

    # Remaining broad topics map one-to-one; anything unknown falls back to
    # "Python and NumPy".
    broad_map = {
        "Vision and CNN": "CNN",
        "Python Fundamentals": "Python and NumPy",
        "Ethics of AI": "Ethics of AI",
    }
    return broad_map.get(broad, "Python and NumPy")
|
||||
|
||||
|
||||
# Per analytics topic: ordered (concept label, keyword list) pairs.
# build_topic_tags() appends a label when any of its keywords occurs as a
# substring of the question's text blob. Order matters because the resulting
# tag list is truncated to five entries.
TOPIC_CONCEPTS = {
    "Naive Bayes": [
        ("Naive Bayes", ["naive bayes"]),
        ("Prior", ["prior"]),
        ("Likelihood", ["likelihood"]),
        ("Posterior", ["posterior"]),
        ("Gaussian", ["gaussian"]),
        ("Laplace Smoothing", ["laplace"]),
        ("Missing Data", ["missing data", "missing value"]),
    ],
    "Bayesian Inference": [
        ("Bayesian Inference", ["bayes", "conditional probability", "posterior"]),
        ("Conditional Probability", ["conditional probability"]),
        ("Bayes Rule", ["bayes rule", "posterior"]),
        ("Prior", ["prior"]),
        ("Posterior", ["posterior"]),
    ],
    "KNN": [
        ("KNN", ["k-nearest", "nearest neighbors", "knn"]),
        ("Euclidean Distance", ["euclidean distance"]),
        ("Manhattan Distance", ["manhattan distance"]),
        ("Cosine Distance", ["cosine distance"]),
        ("Weighted KNN", ["weighted k-nearest", "weighted knn", "inverse of the distance"]),
        ("Classification", ["class label", "predict", "classification"]),
        ("Cross Validation", ["cross-validation", "cross validation"]),
        ("Test Error", ["test error"]),
    ],
    "K-Means": [
        ("K-Means", ["k-means", "k means"]),
        ("Centroid Update", ["centroid"]),
        ("Convergence", ["converged", "convergence"]),
        ("Leader Clustering", ["leader clustering"]),
        ("Outliers", ["outlier"]),
        ("Model Selection", ["elbow method", "silhouette", "suitable k"]),
    ],
    "Perceptron": [
        ("Perceptron", ["perceptron"]),
        ("Decision Boundary", ["decision boundary", "linearly separable"]),
        ("Weight Update", ["weight update", "∆w", "deltaw", "backward propagation"]),
        ("Convergence", ["converged", "convergence"]),
        ("Activation Function", ["activation function"]),
    ],
    "MLP": [
        ("MLP", ["mlp", "multilayer perceptron"]),
        ("Backpropagation", ["back propagation", "backpropagation", "backward propagation"]),
        ("Activation Function", ["activation function", "relu", "sigmoid", "softmax"]),
        ("Hidden Layer", ["hidden layer"]),
        ("Output Layer", ["output layer"]),
        ("Parameter Count", ["number of parameters", "parameter"]),
        ("Overfitting", ["overfitting", "dropout"]),
    ],
    "CNN": [
        ("CNN", ["cnn", "convolutional neural network"]),
        ("Convolution", ["convolution", "kernel"]),
        ("Padding", ["padding", "reflection padding", "zero padding"]),
        ("Stride", ["stride"]),
        ("Pooling", ["pooling", "max pooling", "average pooling"]),
        ("Image Processing", ["image processing", "grayscale image"]),
        ("Histogram", ["histogram"]),
        ("Otsu Thresholding", ["otsu"]),
        ("Dilated Convolution", ["dilated convolution"]),
        ("3D Convolution", ["3d convolution"]),
        ("Dropout", ["dropout"]),
    ],
    "Evaluation Metrics": [
        ("Evaluation Metrics", ["evaluation", "metric"]),
        ("Confusion Matrix", ["confusion matrix"]),
        ("Accuracy", ["accuracy"]),
        ("Precision", ["precision"]),
        ("Recall", ["recall"]),
        ("F1 Score", ["f1"]),
        ("Macro F1", ["macro f1"]),
    ],
    "Cross Validation": [
        ("Cross Validation", ["cross validation", "cross-validation", "d-fold", "k-fold"]),
        ("Train Validation Split", ["validation set", "train", "test fold"]),
        ("Model Selection", ["choose k", "which k", "fold"]),
        ("Data Shuffling", ["shuffle", "shuffling"]),
    ],
    "Python and NumPy": [
        ("Python and NumPy", ["numpy", "python"]),
        ("NumPy", ["numpy", "np."]),
        ("Broadcasting", ["broadcast"]),
        ("Array Indexing", ["index", "slice"]),
        ("Vectorization", ["no explicit loops", "vectorized"]),
        ("Matrix Multiplication", ["matmul", "matrix multiplication", "@"]),
        ("Reshape", ["reshape"]),
        ("Transpose", ["transpose"]),
        ("Masking", ["mask"]),
        ("Convolution", ["convolve"]),
    ],
    "Search Algorithms": [
        ("Search Algorithms", ["search"]),
        ("Breadth-First Search", ["breadth-first", "breadth first", "bfs"]),
        ("Depth-First Search", ["depth-first", "depth first", "dfs"]),
        ("Best-First Search", ["best-first", "best first"]),
        ("A* Search", ["a* search", "a star", "astar"]),
        ("Heuristic", ["heuristic"]),
    ],
    "Game Trees": [
        ("Game Trees", ["game tree", "minimax", "alpha-beta", "alpha beta"]),
        ("Minimax", ["minimax"]),
        ("Alpha-Beta Pruning", ["alpha-beta", "alpha beta", "pruned"]),
        ("Utility", ["utility"]),
    ],
    "Ethics of AI": [
        ("Ethics of AI", ["ethics", "ethical"]),
        ("Bias", ["bias"]),
        ("Privacy", ["privacy"]),
        ("Fairness", ["fair"]),
        ("Research Ethics", ["informed consent", "human participants"]),
        ("Governance", ["monitoring", "production", "organizations"]),
        ("Autonomous Vehicles", ["autonomous vehicle"]),
    ],
}
|
||||
|
||||
|
||||
# Fallback concept tags per analytics topic. build_topic_tags() appends
# these, in order, only until the question has at least two distinct tags.
TOPIC_DEFAULTS = {
    "Naive Bayes": ["Likelihood", "Posterior"],
    "Bayesian Inference": ["Conditional Probability", "Bayes Rule"],
    "KNN": ["Classification", "Distance Calculation"],
    "K-Means": ["Centroid Update", "Convergence"],
    "Perceptron": ["Decision Boundary", "Weight Update"],
    "MLP": ["Activation Function", "Hidden Layer"],
    "CNN": ["Convolution", "Padding"],
    "Evaluation Metrics": ["Confusion Matrix", "F1 Score"],
    "Cross Validation": ["Train Validation Split", "Model Selection"],
    "Python and NumPy": ["NumPy", "Vectorization"],
    "Search Algorithms": ["Breadth-First Search", "Heuristic"],
    "Game Trees": ["Minimax", "Alpha-Beta Pruning"],
    "Ethics of AI": ["Bias", "Fairness"],
}
|
||||
|
||||
# Skill tags used by build_skill_tags() when a question has no skill tags of
# its own, keyed by analytics topic.
DEFAULT_SKILLS = {
    "Naive Bayes": ["Probability Reasoning"],
    "Bayesian Inference": ["Probability Reasoning"],
    "KNN": ["Classification Decision"],
    "K-Means": ["Centroid Update"],
    "Perceptron": ["Decision Boundary"],
    "MLP": ["Concept Explanation"],
    "CNN": ["Concept Explanation"],
    "Evaluation Metrics": ["Metric Reasoning"],
    "Cross Validation": ["Model Selection"],
    "Python and NumPy": ["Code Tracing"],
    "Search Algorithms": ["Algorithm Tracing"],
    "Game Trees": ["Game Reasoning"],
    "Ethics of AI": ["Ethical Reasoning"],
}
|
||||
|
||||
|
||||
def unique_keep_order(values: list[str]) -> list[str]:
    """Return the truthy items of *values* without duplicates, in first-seen order."""
    # dict keys are unique and keep insertion order, so they act as an
    # ordered set here.
    return list(dict.fromkeys(item for item in values if item))
|
||||
|
||||
|
||||
def build_topic_tags(question: dict, analytics_topic: str) -> list[str]:
    """Return up to five de-duplicated topic tags for a question.

    The analytics topic always comes first, followed by every concept from
    TOPIC_CONCEPTS whose keywords occur in the question's text blob. If that
    yields fewer than two distinct tags, entries from TOPIC_DEFAULTS are
    appended until there are two or the defaults run out.
    """
    haystack = text_blob(question)
    matched = [
        label
        for label, keywords in TOPIC_CONCEPTS.get(analytics_topic, [])
        if label != analytics_topic and has_any(haystack, keywords)
    ]
    collected: list[str] = [analytics_topic, *matched]

    for fallback in TOPIC_DEFAULTS.get(analytics_topic, []):
        if len(unique_keep_order(collected)) < 2:
            collected.append(fallback)
        else:
            break

    return unique_keep_order(collected)[:5]
|
||||
|
||||
|
||||
def build_skill_tags(question: dict, analytics_topic: str) -> list[str]:
    """Return at most three display-ready skill tags for a question.

    Existing ``skill_tags`` are normalised and de-duplicated. When none
    remain, the topic's DEFAULT_SKILLS entry is used, or ["Concept Check"]
    for a topic without one.
    """
    labels = unique_keep_order(
        list(map(normalize_skill_tag, question.get("skill_tags") or []))
    )
    chosen = labels or DEFAULT_SKILLS.get(analytics_topic, ["Concept Check"])
    return chosen[:3]
|
||||
|
||||
|
||||
def main() -> None:
    """Recompute topic and skill tags for every COMP2211 course-library question.

    Loads the matching papers, then their questions ordered by paper and
    display order, and writes the inferred analytics topic, topic tags and
    skill tags back to ``paper_questions`` one row at a time.
    """
    client = get_supabase()

    paper_query = (
        client.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
    )
    paper_ids = [row["id"] for row in paper_query.execute().data]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    columns = "id, paper_id, question_number, question_text, raw_answer_text, analytics_topic, topic_tags, skill_tags, topics"
    question_rows = (
        client.table("paper_questions")
        .select(columns)
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )

    for row in question_rows:
        topic = infer_analytics_topic(row)
        tags = build_topic_tags(row, topic)
        changes = {
            "analytics_topic": topic,
            "topic_primary": topic,
            "topic_tags": tags,
            "topics": tags,
            "skill_tags": build_skill_tags(row, topic),
        }
        client.table("paper_questions").update(changes).eq("id", row["id"]).execute()

    print(f"Backfilled {len(question_rows)} COMP2211 questions.")
|
||||
|
||||
|
||||
# Run the backfill only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
169
backend/backfill_null_ai_trio.py
Normal file
169
backend/backfill_null_ai_trio.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""Backfill AI trio for questions where knowledge_reminder IS NULL.
|
||||
|
||||
For each question, generates fields in two separate LLM calls to avoid token truncation:
|
||||
Call 1 → knowledge_reminder + ai_hint (short, ~500 tokens output)
|
||||
Call 2 → solution (long, up to 4096 tokens output)
|
||||
|
||||
Run from the backend directory:
|
||||
uv run python backfill_null_ai_trio.py [--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.paper_processor import qwen_json_completion
|
||||
|
||||
|
||||
KNOWLEDGE_HINT_PROMPT = """\
|
||||
You are an expert tutor. Given a past-paper question, produce two short study aids in English.
|
||||
|
||||
Return JSON exactly:
|
||||
{{
|
||||
"knowledge_reminder": "2-4 sentences summarising the key concept or formula the student must recall.",
|
||||
"ai_hint": "1-3 sentence nudge that guides WITHOUT giving the answer away."
|
||||
}}
|
||||
|
||||
Question:
|
||||
{payload}
|
||||
"""
|
||||
|
||||
SOLUTION_PROMPT = """\
|
||||
You are an expert tutor. Given a past-paper question and its reference answer, write a clear, \
|
||||
step-by-step model solution in English. Show all working. Be thorough but stop when the answer \
|
||||
is complete — do not pad.
|
||||
|
||||
Return JSON exactly:
|
||||
{{
|
||||
"solution": "<full step-by-step solution as a single string, use \\n for line breaks>"
|
||||
}}
|
||||
|
||||
Question:
|
||||
{payload}
|
||||
"""
|
||||
|
||||
|
||||
def build_payload(q: dict) -> dict:
    """Build the JSON-serialisable question summary that is sent to the LLM.

    The reference answer is taken from the first available source, in this
    order: ``raw_answer_text``, ``correct_option``, ``correct_answer``. It is
    an empty string when none is present.

    Args:
        q: Question row. ``question_number`` is required; every other field
            may be missing or null.

    Returns:
        Dict with ``question_number``, ``question_type`` (defaults to
        "long_question"), ``score`` (defaults to "unknown"),
        ``question_text``, ``topics`` and ``reference_answer``.

    Raises:
        KeyError: If ``question_number`` is absent.
    """
    ref = ""
    if q.get("raw_answer_text"):
        ref = q["raw_answer_text"]
    elif q.get("correct_option"):
        ref = f"Correct option: {q['correct_option']}"
    elif q.get("correct_answer"):
        ref = f"Correct answer: {q['correct_answer']}"

    return {
        "question_number": q["question_number"],
        # .get() so a row without the column behaves like a null value
        # instead of raising; the fallback already treats "no type" as normal.
        "question_type": q.get("question_type") or "long_question",
        "score": q.get("score") or "unknown",
        "question_text": q.get("question_text") or "",
        "topics": q.get("topics") or [],
        "reference_answer": ref,
    }
|
||||
|
||||
|
||||
async def process_one(sb, q: dict, dry_run: bool) -> bool:
    """Generate and store the study aids for one question.

    Makes two independent LLM calls, one for knowledge_reminder and ai_hint
    and one for the solution, with a one-second pause between them. Whatever
    fields came back non-empty are written to the question's row; a failure
    of one call does not prevent the other from being used.

    Returns:
        True when the row was updated (or would be, in dry-run mode), False
        when neither call produced any usable field.
    """
    serialized = json.dumps(build_payload(q), ensure_ascii=False)
    target_id = q["id"]
    label = q["question_number"]

    if dry_run:
        print(f" [dry-run] would process {label}")
        return True

    fields: dict = {}

    # First call: the two short study aids.
    try:
        hint_reply = await qwen_json_completion(
            system_prompt=KNOWLEDGE_HINT_PROMPT.format(payload=serialized),
            temperature=0.3,
            max_tokens=1024,
        )
        for key in ("knowledge_reminder", "ai_hint"):
            if hint_reply.get(key):
                fields[key] = hint_reply[key]
    except Exception as exc:
        print(f" WARN call-1 failed for {label}: {exc}")

    await asyncio.sleep(1)

    # Second call: the long worked solution, with a larger token budget.
    try:
        solution_reply = await qwen_json_completion(
            system_prompt=SOLUTION_PROMPT.format(payload=serialized),
            temperature=0.3,
            max_tokens=4096,
        )
        if solution_reply.get("solution"):
            fields["solution"] = solution_reply["solution"]
    except Exception as exc:
        print(f" WARN call-2 failed for {label}: {exc}")

    if fields:
        sb.table("paper_questions").update(fields).eq("id", target_id).execute()
        return True

    print(f" SKIP {label}: both calls returned nothing")
    return False
|
||||
|
||||
|
||||
async def backfill(dry_run: bool = False) -> None:
    """Fill the AI study aids for COMP2211 questions that still lack them.

    Selects the course-library questions whose ``knowledge_reminder`` is
    NULL, groups them by paper for readable progress output and processes
    them one at a time through process_one().

    Args:
        dry_run: When True nothing is generated or written; the questions
            that would be processed are only listed.
    """
    # Function-scope import, as in the original block.
    from collections import defaultdict

    sb = get_supabase()

    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .is_("knowledge_reminder", "null")
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )

    if not questions:
        print("No NULL questions found — all done!")
        return

    print(f"Found {len(questions)} questions with NULL knowledge_reminder.")

    # Group by paper for cleaner output
    by_paper: dict[str, list] = defaultdict(list)
    for q in questions:
        by_paper[q["paper_id"]].append(q)

    total_updated = 0
    for paper_idx, (paper_id, qs) in enumerate(by_paper.items(), 1):
        print(f"\n[{paper_idx}/{len(by_paper)}] paper_id={paper_id} — {len(qs)} NULL questions")
        for q in qs:
            print(f" Processing {q['question_number']}...", end=" ", flush=True)
            ok = await process_one(sb, q, dry_run)
            if ok:
                total_updated += 1
                # Only report success; process_one prints its own SKIP line.
                print("done")
            if not dry_run:
                # Pause between questions that actually hit the LLM API;
                # a dry run makes no calls, so there is nothing to throttle.
                await asyncio.sleep(1.5)

    print(f"\nDone. {total_updated}/{len(questions)} questions updated.")
|
||||
|
||||
|
||||
# Script entry point: "--dry-run" anywhere on the command line switches the
# backfill to listing mode.
if __name__ == "__main__":
    dry_run = "--dry-run" in sys.argv
    asyncio.run(backfill(dry_run=dry_run))
|
||||
135
backend/backfill_similar_questions.py
Normal file
135
backend/backfill_similar_questions.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""Pre-compute similar_questions for all COMP2211 course-library questions.
|
||||
|
||||
For each question, runs the same similarity logic as the API and writes the result
|
||||
into paper_questions.similar_questions (JSONB). The API will then return this
|
||||
pre-computed value directly with no computation overhead.
|
||||
|
||||
Run from the backend directory:
|
||||
uv run python backfill_similar_questions.py [--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from collections import Counter
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.routers.questions import (
|
||||
similarity_score,
|
||||
question_family,
|
||||
display_topics,
|
||||
)
|
||||
|
||||
|
||||
def _source_label(paper: dict) -> str:
    """Return a display label such as "2023 Fall Final Part A" for a paper row.

    Null columns are treated as empty text; dict.get() only applies its
    default to missing keys, so a NULL term or exam_type would otherwise
    reach ``.title()`` as None.
    """
    year = paper.get("year")
    year_text = "" if year is None else str(year)
    term = (paper.get("term") or "").title()
    exam_type = (paper.get("exam_type") or "").title()
    label = f"{year_text} {term} {exam_type}".strip()
    if paper.get("part_label"):
        label = f"{label} Part {paper['part_label']}"
    return label


def run(dry_run: bool = False) -> None:
    """Pre-compute ``similar_questions`` for every ready COMP2211 question.

    For each question, candidates are the questions from other papers (and,
    when the target has an analytics topic, the same topic). Candidates
    scoring below 20 percent are dropped, the rest are ranked, reduced to
    the best match per paper and capped at twelve entries before being
    written to ``paper_questions.similar_questions``.

    Args:
        dry_run: When True the result sizes are printed but nothing is
            written.
    """
    sb = get_supabase()

    # Fetch all ready COMP2211 papers
    papers = (
        sb.table("papers")
        .select("id, year, term, exam_type, part_label")
        .eq("course_code", "COMP2211")
        .eq("status", "ready")
        .execute()
        .data
    )
    if not papers:
        print("No ready COMP2211 papers found.")
        return

    papers_by_id = {p["id"]: p for p in papers}
    paper_ids = list(papers_by_id.keys())
    # The label depends only on the paper, so build it once per paper rather
    # than once per (target, candidate) pair.
    source_by_paper = {pid: _source_label(p) for pid, p in papers_by_id.items()}

    # Fetch all questions for these papers
    all_questions = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
            "difficulty, knowledge_reminder, ai_hint, solution"
        )
        .in_("paper_id", paper_ids)
        .execute()
        .data
    )
    print(f"Found {len(all_questions)} questions across {len(papers)} papers.")

    # Batch full-text scores not practical here; skip RPC, rely on tag/topic scoring
    # (text_score = 0 for all, still produces good tag-based results)

    updated = 0
    skipped = 0

    for i, target in enumerate(all_questions, 1):
        target_paper_id = target["paper_id"]
        target_topic = target.get("analytics_topic")

        # Candidates: same course, different paper
        candidates = [
            q for q in all_questions
            if q["paper_id"] != target_paper_id
        ]

        # Pre-filter by analytics_topic if available
        if target_topic:
            candidates = [c for c in candidates if c.get("analytics_topic") == target_topic]

        if not candidates:
            skipped += 1
            print(f" [{i}/{len(all_questions)}] {target['question_number']} — no candidates, skip")
            continue

        ranked = []
        for candidate in candidates:
            match_percent, reasons = similarity_score(target, candidate, text_score=0.0)
            if match_percent < 20:
                continue
            ranked.append({
                "id": candidate["id"],
                "paper_id": candidate["paper_id"],
                "source": source_by_paper.get(candidate["paper_id"], ""),
                "question_number": candidate["question_number"],
                "match_percent": match_percent,
                "match_reasons": reasons,
                "question_type": question_family(candidate),
                "question_text": candidate["question_text"],
                "topics": display_topics(candidate),
                "difficulty": candidate.get("difficulty"),
                "knowledge_reminder": candidate.get("knowledge_reminder", ""),
                "ai_hint": candidate.get("ai_hint", ""),
                "solution": candidate.get("solution", ""),
            })

        # Best match first; source and question number make ties deterministic.
        ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"]))

        # Deduplicate: best per paper
        seen_papers: set[str] = set()
        deduped = []
        for item in ranked:
            if item["paper_id"] not in seen_papers:
                seen_papers.add(item["paper_id"])
                deduped.append(item)
        deduped = deduped[:12]

        print(f" [{i}/{len(all_questions)}] {target['question_number']} → {len(deduped)} similar", end="")

        if dry_run:
            print(" [dry-run]")
            continue

        sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", target["id"]).execute()
        updated += 1
        print()

    print(f"\nDone. {updated} updated, {skipped} skipped (no candidates).")
|
||||
|
||||
|
||||
# Script entry point: "--dry-run" anywhere on the command line disables the
# database writes.
if __name__ == "__main__":
    dry_run = "--dry-run" in sys.argv
    run(dry_run=dry_run)
|
||||
238
backend/backfill_vision.py
Normal file
238
backend/backfill_vision.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""
Reprocess every paper that is already "ready" using Vision mode:
- Fetch the PDF from Supabase Storage → render to images → Vision question extraction → exec → AI trio → update the DB

Usage:
    python backfill_vision.py --course COMP2211
    python backfill_vision.py --paper-id <uuid>
"""
|
||||
|
||||
import asyncio
|
||||
import argparse
|
||||
import requests
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.paper_processor import (
|
||||
process_paper,
|
||||
strip_nulls,
|
||||
pdf_to_images,
|
||||
gemini_vision_json,
|
||||
deepseek_json_completion,
|
||||
parse_json_response,
|
||||
extract_code_lines,
|
||||
try_exec_python,
|
||||
chunked,
|
||||
sort_questions,
|
||||
STRUCTURE_PROMPT,
|
||||
ANSWER_MATCH_PROMPT,
|
||||
BATCH_ANALYSIS_PROMPT,
|
||||
)
|
||||
import json
|
||||
import traceback
|
||||
|
||||
|
||||
def _download(url: str, timeout: int = 60) -> bytes:
    """Fetch *url* and return the body, raising on HTTP error statuses."""
    response = requests.get(url, timeout=timeout)
    # Without this an error page (404/403 HTML) would be handed on as PDF bytes.
    response.raise_for_status()
    return response.content


async def reprocess_paper(paper: dict):
    """Reprocess a single paper in Vision mode and replace its questions.

    Steps: download the paper (and optional answer) PDF, render the pages to
    images, extract questions with the vision model in page batches, match
    answers, derive reference answers by executing ``print(...)`` lines where
    no answer was matched, generate the AI study aids in batches of three
    questions, then delete the paper's existing ``paper_questions`` rows and
    insert the new ones.

    The paper is skipped, and its existing rows are left untouched, when the
    PDF cannot be downloaded or when no question could be extracted.

    Args:
        paper: Row from ``papers``; reads ``id``, ``course_code``, ``year``,
            ``term``, ``exam_type``, ``paper_file_url`` and the optional
            ``answer_file_url``.
    """
    sb = get_supabase()
    paper_id = paper["id"]
    label = f"{paper['course_code']} {paper['year']} {paper['term']} {paper['exam_type']}"
    print(f"\n=== {label} ({paper_id[:8]}) ===")

    # 1. Download the PDFs
    try:
        pdf_bytes = _download(paper["paper_file_url"])
    except requests.RequestException as e:
        print(f" SKIP: failed to fetch PDF: {e}")
        return

    answer_bytes = None
    if paper.get("answer_file_url"):
        try:
            answer_bytes = _download(paper["answer_file_url"])
        except requests.RequestException as e:
            # Answers are optional; continue without them but say so.
            print(f" WARN: failed to fetch answer PDF, continuing without answers: {e}")

    # 2. PDF -> images (rendered once; the page count comes from the result)
    paper_images = pdf_to_images(pdf_bytes)
    print(f" Rendering {len(paper_images)} pages... done")

    # 3. Vision question extraction, 8 pages per batch
    PAGE_BATCH = 8
    all_questions: list = []
    meta: dict = {}
    failed_batches = 0
    print(f" Vision extraction ({len(paper_images)} pages, {-(-len(paper_images)//PAGE_BATCH)} batches)...")
    for i in range(0, len(paper_images), PAGE_BATCH):
        batch_imgs = paper_images[i:i + PAGE_BATCH]
        print(f" Pages {i+1}-{i+len(batch_imgs)}...", end=" ", flush=True)
        try:
            batch_result = await gemini_vision_json(
                system_prompt=STRUCTURE_PROMPT,
                images=batch_imgs,
                user_text=f"Pages {i+1}-{i+len(batch_imgs)} of the exam paper. Extract all questions visible on these pages.",
                temperature=0,
            )
            # Paper-level metadata is taken from the first batch that succeeds.
            if not meta:
                meta = {k: batch_result.get(k) for k in ("total_score", "difficulty_level", "topics_summary")}
            qs = batch_result.get("questions", [])
            all_questions.extend(qs)
            print(f"done ({len(qs)} questions)")
        except Exception as e:
            failed_batches += 1
            print(f"FAILED: {e}")
    questions = sort_questions(all_questions)
    print(f" Total: {len(questions)} questions extracted")

    if not questions:
        # Step 7 deletes the existing rows first; with nothing to insert that
        # would simply wipe the paper's questions.
        print(" SKIP: no questions extracted, existing questions left untouched")
        return
    if failed_batches:
        print(f" WARN: {failed_batches} page batch(es) failed; extracted questions may be incomplete")

    # 4. Answer matching
    answers_map = {}
    if answer_bytes:
        print(" Vision answer matching...", end=" ", flush=True)
        answer_images = pdf_to_images(answer_bytes)
        questions_json = json.dumps(
            [{"question_number": q["question_number"], "question_type": q["question_type"]}
             for q in questions], ensure_ascii=False
        )
        try:
            match_result = await gemini_vision_json(
                system_prompt=ANSWER_MATCH_PROMPT.format(
                    questions_json=questions_json, answer_text="(See images)"
                ),
                images=answer_images,
                user_text=f"Match answers to these questions: {questions_json}",
                temperature=0,
            )
            answers_map = {a["question_number"]: a for a in match_result.get("answers", [])}
            print(f"done ({len(answers_map)} matched)")
        except Exception as e:
            print(f"FAILED: {e}")

    # 5. Build the LLM payloads (executing Python from the question text)
    # NOTE(review): try_exec_python presumably executes code extracted from
    # the paper text - only run this script on trusted PDFs; confirm the
    # helper's sandboxing.
    import numpy as np
    exec_namespaces: dict = {}
    batched_payloads = []

    for q in questions:
        qnum = q["question_number"]
        answer = answers_map.get(qnum, {})
        full_text = q["question_text"] or ""

        answer_section = ""
        if answer.get("raw_answer_text"):
            answer_section = answer["raw_answer_text"]
        elif answer.get("correct_option"):
            answer_section = f"Correct option: {answer['correct_option']}"
        elif answer.get("correct_answer"):
            answer_section = f"Correct answer: {answer['correct_answer']}"

        if not answer_section:
            # Sub-questions of one parent share a namespace, created from the
            # first such question's setup code.
            parent_q = q.get("parent_question")
            group_key = parent_q or qnum
            if group_key not in exec_namespaces:
                ns: dict = {"np": np}
                setup = extract_code_lines(full_text)
                try_exec_python(setup, ns)
                exec_namespaces[group_key] = ns
            ns = exec_namespaces[group_key]
            print_lines = [l.strip() for l in full_text.splitlines() if l.strip().startswith("print(")]
            if print_lines:
                # Only the last print(...) line is run to obtain the answer.
                out = try_exec_python(print_lines[-1], ns)
                if out is not None:
                    answer_section = f"Executed output: {out}"
                    print(f" [exec] {qnum}: {out[:60]}")

        batched_payloads.append({
            "question_number": qnum,
            "question_type": q["question_type"],
            "score": q.get("score", "unknown"),
            "question_text": full_text,
            "topics": q.get("topics", []),
            "reference_answer": answer_section,
        })

    # 6. AI trio (knowledge_reminder / ai_hint / solution), 3 questions per call
    print(f" Generating AI trio ({len(batched_payloads)} questions, {len(list(chunked(batched_payloads, 3)))} batches)...")
    analyses: dict = {}
    for batch in chunked(batched_payloads, 3):
        nums = [p["question_number"] for p in batch]
        print(f" Batch {nums}...", end=" ", flush=True)
        try:
            result = await deepseek_json_completion(
                system_prompt=BATCH_ANALYSIS_PROMPT.format(
                    questions_payload=json.dumps(batch, ensure_ascii=False)
                ),
                temperature=0.3,
            )
            for item in result.get("analyses", []):
                if item.get("question_number"):
                    analyses[item["question_number"]] = item
            print(f"done ({len(result.get('analyses', []))})")
        except Exception as e:
            print(f"FAILED: {e}")
        await asyncio.sleep(1)

    # 7. Delete the old questions, then write the new ones
    print(" Writing to DB...", end=" ", flush=True)
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()

    for i, q in enumerate(questions):
        qnum = q["question_number"]
        answer = answers_map.get(qnum, {})
        analysis = analyses.get(qnum, {})
        # topics may be missing, null or an empty list; indexing [0] directly
        # would raise here, after the old rows are already gone.
        topics = q.get("topics") or []
        sb.table("paper_questions").insert(strip_nulls({
            "paper_id": paper_id,
            "question_number": qnum,
            "parent_question": q.get("parent_question"),
            "display_order": i,
            "question_type": q["question_type"],
            "question_text": q["question_text"],
            "score": q.get("score"),
            "page_number": q.get("page_number"),
            "options": q.get("options"),
            "correct_option": answer.get("correct_option"),
            "correct_answer": answer.get("correct_answer"),
            "raw_answer_text": answer.get("raw_answer_text"),
            "topics": topics,
            "analytics_topic": topics[0] if topics else None,
            "topic_tags": topics,
            "difficulty": q.get("difficulty"),
            "knowledge_reminder": analysis.get("knowledge_reminder", ""),
            "ai_hint": analysis.get("ai_hint", ""),
            "solution": analysis.get("solution", ""),
        })).execute()

    sb.table("papers").update({
        "question_count": len(questions),
        "total_score": meta.get("total_score"),
        "topics_summary": meta.get("topics_summary"),
        "difficulty_level": meta.get("difficulty_level"),
    }).eq("id", paper_id).execute()

    print(f"done ({len(questions)} questions written)")
|
||||
|
||||
|
||||
async def main():
    """Reprocess papers whose status is ``ready``, one after another.

    Command-line filters:
      --paper-id  restrict the run to a single paper row (takes precedence)
      --course    restrict the run to one course code (upper-cased before matching)

    A failure in one paper is printed with its traceback and does not stop
    the remaining papers from being processed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--course", help="Course code")
    cli.add_argument("--paper-id", help="Single paper ID")
    opts = cli.parse_args()

    selection = get_supabase().table("papers").select("*").eq("status", "ready")
    # --paper-id wins over --course when both are supplied.
    if opts.paper_id:
        selection = selection.eq("id", opts.paper_id)
    elif opts.course:
        selection = selection.eq("course_code", opts.course.upper())
    pending = selection.order("created_at").execute().data

    print(f"Papers to reprocess: {len(pending)}")
    for record in pending:
        try:
            await reprocess_paper(record)
        except Exception as err:
            # Best effort: report and move on to the next paper.
            print(f" ERROR: {err}")
            traceback.print_exc()

    print("\nAll done.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
29
backend/fill_manual_study_aids.py
Normal file
29
backend/fill_manual_study_aids.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Deprecated: study aids must come from LLM output, not template fillers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
MESSAGE = """
|
||||
fill_manual_study_aids.py is intentionally disabled.
|
||||
|
||||
Reason:
|
||||
- knowledge_reminder / ai_hint / solution must be generated by LLM
|
||||
- template-based filler content polluted the COMP2211 course library
|
||||
|
||||
Use one of these paths instead:
|
||||
1. Regenerate study aids through the real LLM pipeline in app/services/paper_processor.py
|
||||
2. Rebuild paper_questions from a reviewed source and then run LLM generation
|
||||
|
||||
This script must not be used to backfill production study aids.
|
||||
""".strip()
|
||||
|
||||
|
||||
def main() -> None:
    """Explain on stderr why this script is disabled, then exit with status 1."""
    sys.stderr.write(f"{MESSAGE}\n")
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
240
backend/import_course_manifest.py
Normal file
240
backend/import_course_manifest.py
Normal file
@@ -0,0 +1,240 @@
|
||||
"""Import a canonical course manifest into Supabase-backed papers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.services.paper_processor import process_paper
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the manifest importer.

    Returns:
        Namespace with ``manifest`` and ``papers_root`` (both ``Path``, required),
        ``user_id``, ``course_code``, ``exam_keys`` (list, repeatable flag),
        ``process`` and ``dry_run`` (booleans).
    """
    cli = argparse.ArgumentParser(
        description="Import a canonical course paper manifest into Supabase."
    )
    # (flag, add_argument keyword settings) in the order they should appear in --help.
    option_table = (
        (
            "--manifest",
            {"type": Path, "required": True, "help": "Path to the manifest JSON file."},
        ),
        (
            "--papers-root",
            {
                "type": Path,
                "required": True,
                "help": "Root folder that contains the course PDF files referenced by the manifest.",
            },
        ),
        (
            "--user-id",
            {
                "required": False,
                "help": "Existing auth.users UUID used as the owner of imported course-library rows.",
            },
        ),
        (
            "--course-code",
            {"help": "Optional filter to only import entries from one course."},
        ),
        (
            "--exam-key",
            {
                "action": "append",
                "dest": "exam_keys",
                "default": [],
                "help": "Optional exam_key filter. Repeat the flag to import multiple entries.",
            },
        ),
        (
            "--process",
            {
                "action": "store_true",
                "help": "Run the full paper processing pipeline after the files are uploaded.",
            },
        ),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "Print what would be imported without uploading or writing database rows.",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_manifest(path: Path) -> list[dict[str, Any]]:
    """Read the manifest file at *path* (UTF-8 JSON) and return its entries.

    Raises:
        ValueError: if the top-level JSON value is not an array.
    """
    entries = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(entries, list):
        return entries
    raise ValueError("Manifest must be a JSON array.")
|
||||
|
||||
|
||||
def should_import(entry: dict[str, Any], args: argparse.Namespace) -> bool:
    """Decide whether a manifest *entry* passes the CLI filters.

    An entry is imported only when it matches ``args.course_code`` (if given),
    its ``exam_key`` is among ``args.exam_keys`` (if any were given), and its
    ``importable`` field is truthy.
    """
    course_filter = args.course_code
    course_matches = not course_filter or entry.get("course_code") == course_filter
    if not course_matches:
        return False

    key_filter = args.exam_keys
    if key_filter and entry.get("exam_key") not in frozenset(key_filter):
        return False

    importable = entry.get("importable")
    return bool(importable)
|
||||
|
||||
|
||||
def resolve_file_path(root: Path, filename: str | None) -> Path | None:
    """Locate *filename* under *root*, tolerating a browser-style " (1)" suffix.

    Lookup order:
      1. ``root / filename`` exactly as given.
      2. A unique file directly inside *root* whose name equals *filename*
         once every " (1)" is removed from both.
      3. A unique file directly inside *root* with the same suffix whose stem
         equals the stem of *filename* once every " (1)" is removed from both.

    Args:
        root: Directory that holds the referenced files.
        filename: File name (or relative path) taken from the manifest.

    Returns:
        The resolved path, or None when *filename* is empty, *root* is not a
        directory, or no unambiguous match exists.
    """
    if not filename:
        return None

    direct = root / filename
    if direct.exists():
        return direct

    # Without this guard ``iterdir`` raises a raw OSError for a missing or
    # non-directory root, although this function's contract is "Path or None".
    if not root.is_dir():
        return None

    def normalize(name: str) -> str:
        return name.replace(" (1)", "")

    all_files = [candidate for candidate in root.iterdir() if candidate.is_file()]

    target_name = normalize(filename)
    name_matches = [
        candidate for candidate in all_files if normalize(candidate.name) == target_name
    ]
    if len(name_matches) == 1:
        return name_matches[0]

    # Fall back to comparing stem and suffix separately; this also covers a
    # *filename* that carries directory components the first two steps missed.
    wanted = Path(filename)
    wanted_stem = normalize(wanted.stem)
    stem_matches = [
        candidate
        for candidate in all_files
        if candidate.suffix == wanted.suffix and normalize(candidate.stem) == wanted_stem
    ]
    if len(stem_matches) == 1:
        return stem_matches[0]

    return None
|
||||
|
||||
|
||||
def read_file_bytes(root: Path, filename: str | None) -> bytes | None:
    """Return the contents of *filename* resolved under *root*.

    Returns:
        The file bytes, or None when *filename* is empty or None.

    Raises:
        FileNotFoundError: if a file name was given but could not be resolved
            to an existing file.
    """
    if filename:
        located = resolve_file_path(root, filename)
        if located is not None and located.exists():
            return located.read_bytes()
        raise FileNotFoundError(f"Referenced file does not exist under {root}: {filename}")
    return None
|
||||
|
||||
|
||||
def build_storage_path(entry: dict[str, Any], kind: str) -> str:
    """Build the storage object path for one PDF of a manifest *entry*.

    The layout is ``course-library/<course_code>/<exam_key>/<kind>.pdf``.
    """
    exam_key = entry["exam_key"]
    course_code = entry["course_code"]
    segments = ("course-library", f"{course_code}", f"{exam_key}", f"{kind}.pdf")
    return "/".join(segments)
|
||||
|
||||
|
||||
def upsert_paper_record(
    entry: dict[str, Any],
    user_id: str | None,
    paper_url: str,
    answer_url: str | None,
) -> str:
    """Create or refresh the ``papers`` row for a course-library manifest entry.

    The row is looked up by ``source_kind == "course_library"`` together with
    the entry's ``exam_key``. A match is updated in place; otherwise a new row
    is inserted. Either way the row is written with status ``processing``.

    Returns:
        The id of the updated or newly inserted paper row.
    """
    client = get_supabase()
    row = {
        "user_id": user_id,
        "course_code": entry["course_code"],
        "year": entry["year"],
        "term": entry["term"],
        "exam_type": entry["exam_type"],
        "part_label": entry.get("part_label"),
        "paper_file_url": paper_url,
        "answer_file_url": answer_url,
        "status": "processing",
        "source_kind": "course_library",
        "source_exam_key": entry["exam_key"],
        "source_question_filename": entry.get("question_pdf"),
        "source_answer_filename": entry.get("primary_answer_pdf"),
    }

    lookup = (
        client.table("papers")
        .select("id")
        .eq("source_kind", "course_library")
        .eq("source_exam_key", entry["exam_key"])
        .limit(1)
    )
    matches = lookup.execute().data

    if not matches:
        inserted = client.table("papers").insert(row).execute().data
        return inserted[0]["id"]

    paper_id = matches[0]["id"]
    client.table("papers").update(row).eq("id", paper_id).execute()
    return paper_id
|
||||
|
||||
|
||||
def reset_existing_processed_data(paper_id: str) -> None:
    """Discard previously processed output for *paper_id*.

    Deletes every ``paper_questions`` row of the paper, then sets the paper
    back to status ``processing`` with its derived columns cleared to NULL.
    """
    client = get_supabase()
    client.table("paper_questions").delete().eq("paper_id", paper_id).execute()

    derived_columns = (
        "error_message",
        "paper_extracted_text",
        "answer_extracted_text",
        "total_score",
        "question_count",
        "topics_summary",
        "difficulty_level",
    )
    reset_values = {"status": "processing", **dict.fromkeys(derived_columns)}
    client.table("papers").update(reset_values).eq("id", paper_id).execute()
|
||||
|
||||
|
||||
async def import_entry(
    entry: dict[str, Any],
    args: argparse.Namespace,
) -> None:
    """Import one manifest entry: upload its PDFs, upsert its row, optionally process it.

    Both PDFs are read from ``args.papers_root`` before anything else, so a
    dry run still fails on missing files. With ``args.dry_run`` nothing is
    uploaded or written. With ``args.process`` the stored results are reset
    and the paper is run through ``process_paper``.

    Raises:
        ValueError: if the entry names no question PDF.
        FileNotFoundError: propagated from ``read_file_bytes`` for a missing file.
    """
    question_name = entry.get("question_pdf")
    answer_name = entry.get("primary_answer_pdf")
    paper_bytes = read_file_bytes(args.papers_root, question_name)
    answer_bytes = read_file_bytes(args.papers_root, answer_name)

    if paper_bytes is None:
        raise ValueError(f"Importable entry is missing question PDF: {entry['exam_key']}")

    if args.dry_run:
        print(
            f"[dry-run] {entry['exam_key']}: "
            f"question={entry.get('question_pdf')} answer={entry.get('primary_answer_pdf')}"
        )
        return

    client = get_supabase()

    def store_pdf(kind: str, content: bytes) -> str:
        # Upload to the "papers" bucket (upsert option set) and return its public URL.
        object_path = build_storage_path(entry, kind)
        client.storage.from_("papers").upload(
            object_path,
            content,
            file_options={"content-type": "application/pdf", "upsert": "true"},
        )
        return client.storage.from_("papers").get_public_url(object_path)

    paper_url = store_pdf("paper", paper_bytes)
    # Empty or missing answer bytes mean the entry has no answer file to upload.
    answer_url = store_pdf("answer", answer_bytes) if answer_bytes else None

    paper_id = upsert_paper_record(entry, args.user_id, paper_url, answer_url)
    print(f"Imported metadata for {entry['exam_key']} -> paper_id={paper_id}")

    if not args.process:
        return

    reset_existing_processed_data(paper_id)
    await process_paper(paper_id, paper_bytes, answer_bytes)
    print(f"Processed {entry['exam_key']}")
|
||||
|
||||
|
||||
async def main() -> None:
    """Import every manifest entry that passes the CLI filters, sequentially."""
    args = parse_args()
    selected = list(
        filter(lambda candidate: should_import(candidate, args), load_manifest(args.manifest))
    )

    if not selected:
        print("No manifest entries matched the provided filters.")
        return

    print(f"Preparing to import {len(selected)} manifest entries.")
    for item in selected:
        await import_entry(item, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
17
backend/pyproject.toml
Normal file
17
backend/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
[project]
|
||||
name = "pastpaper-master-backend"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"fastapi>=0.115.0",
|
||||
"uvicorn[standard]>=0.30.0",
|
||||
"python-dotenv>=1.0.0",
|
||||
"python-multipart>=0.0.9",
|
||||
"supabase>=2.0.0",
|
||||
"openai>=1.50.0",
|
||||
"PyMuPDF>=1.24.0",
|
||||
"pydantic>=2.0.0",
|
||||
"pydantic-settings>=2.0.0",
|
||||
"httpx>=0.27.0",
|
||||
"numpy>=2.4.4",
|
||||
]
|
||||
174
backend/regen_ai_trio_comp2211.py
Normal file
174
backend/regen_ai_trio_comp2211.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions.
|
||||
|
||||
Reads existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by
|
||||
paper_processor.py — but does UPDATE instead of INSERT, so question structure is untouched.
|
||||
|
||||
Run from the backend directory:
|
||||
uv run python regen_ai_trio_comp2211.py
|
||||
|
||||
Pass --dry-run to print batches without calling the LLM or writing to the database.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked
|
||||
|
||||
|
||||
def build_reference_answer(q: dict) -> str:
    """Pick the best available reference answer text for question row *q*.

    Preference order: the raw answer text verbatim, then the correct option,
    then the correct answer (the latter two prefixed with a label). Returns
    an empty string when none of them is set.
    """
    verbatim = q.get("raw_answer_text")
    if verbatim:
        return verbatim

    labelled_fields = (
        ("correct_option", "Correct option"),
        ("correct_answer", "Correct answer"),
    )
    for field, label in labelled_fields:
        value = q.get(field)
        if value:
            return f"{label}: {value}"
    return ""
|
||||
|
||||
|
||||
async def regen(dry_run: bool = False) -> None:
    """Regenerate knowledge_reminder / ai_hint / solution for COMP2211 course-library questions.

    Existing ``paper_questions`` rows are updated in place; no rows are
    inserted or deleted. Questions are sent to the LLM in batches that never
    mix papers. A question missing from a batch response, or every question
    of a batch that failed, is retried on its own.

    Args:
        dry_run: When true, only print the planned batches; the LLM is not
            called and nothing is written.
    """
    from collections import defaultdict

    # Single source of truth for the batch size (used for chunking and id slicing).
    batch_size = 3
    sb = get_supabase()

    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )
    print(f"Found {len(questions)} questions across {len(paper_ids)} papers.")

    # Group (row id, LLM payload) pairs by paper so batches don't mix papers
    # (cleaner context for the LLM).
    items_by_paper: dict[str, list[tuple[str, dict]]] = defaultdict(list)
    for q in questions:
        payload = {
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q.get("question_text") or "",
            "topics": q.get("topics") or [],
            "reference_answer": build_reference_answer(q),
        }
        items_by_paper[q["paper_id"]].append((q["id"], payload))

    def build_prompt(payloads: list[dict]) -> str:
        # Same prompt for batch and single-question calls; only the payload differs.
        return BATCH_ANALYSIS_PROMPT.format(
            questions_payload=json.dumps(payloads, ensure_ascii=False),
        )

    def save_analysis(row_id: str, analysis: dict) -> None:
        sb.table("paper_questions").update({
            "knowledge_reminder": analysis.get("knowledge_reminder", ""),
            "ai_hint": analysis.get("ai_hint", ""),
            "solution": analysis.get("solution", ""),
        }).eq("id", row_id).execute()

    async def run_single(row_id: str, payload: dict) -> bool:
        """Generate and store the analysis for one question; report failure instead of raising."""
        try:
            r = await qwen_json_completion(
                system_prompt=build_prompt([payload]),
                temperature=0.3,
                max_tokens=8192,
            )
            items = r.get("analyses", [])
            if not items:
                return False
            save_analysis(row_id, items[0])
            return True
        except Exception as exc:
            # Still best-effort, but say why the retry failed instead of hiding it.
            print(f"\n single-question retry failed for {payload['question_number']}: {exc}")
            return False

    total_updated = 0
    total_papers = len(items_by_paper)

    for paper_idx, (paper_id, items) in enumerate(items_by_paper.items(), 1):
        ids = [item[0] for item in items]
        batch_payloads = [item[1] for item in items]

        print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} — {len(batch_payloads)} questions")

        for batch_idx, batch in enumerate(chunked(batch_payloads, batch_size), 1):
            print(f" Batch {batch_idx}: questions {[b['question_number'] for b in batch]}", end="", flush=True)

            if dry_run:
                print(" [dry-run, skipped]")
                continue

            batch_start = (batch_idx - 1) * batch_size
            batch_ids = ids[batch_start: batch_start + batch_size]

            # Row ids stored so far in this batch. If the batch path fails midway,
            # the fallback skips these rows and they are counted exactly once.
            written_ids: set[str] = set()

            try:
                result = await qwen_json_completion(
                    system_prompt=build_prompt(batch),
                    temperature=0.3,
                    max_tokens=8192,
                )
                analyses = {item["question_number"]: item for item in result.get("analyses", [])}
                for row_id, payload in zip(batch_ids, batch):
                    qnum = payload["question_number"]
                    analysis = analyses.get(qnum)
                    if not analysis:
                        # fallback: retry this single question alone
                        if await run_single(row_id, payload):
                            written_ids.add(row_id)
                        else:
                            print(f"\n SKIP: {qnum}")
                    else:
                        save_analysis(row_id, analysis)
                        written_ids.add(row_id)
                print(f" → {len(written_ids)} written")
            except Exception as exc:
                # batch failed — retry each not-yet-written question individually
                print(" [batch error, retrying 1-by-1]")
                print(f" batch error was: {exc}")
                for row_id, payload in zip(batch_ids, batch):
                    if row_id in written_ids:
                        continue
                    if await run_single(row_id, payload):
                        written_ids.add(row_id)
                    await asyncio.sleep(1)
                print(f" → {len(written_ids)}/{len(batch)} written")

            total_updated += len(written_ids)

            # Pause between batches (pacing for the LLM endpoint).
            await asyncio.sleep(2.5)

    print(f"\nDone. {total_updated} questions updated.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dry_run = "--dry-run" in sys.argv
|
||||
asyncio.run(regen(dry_run=dry_run))
|
||||
69
backend/regenerate_analysis.py
Normal file
69
backend/regenerate_analysis.py
Normal file
@@ -0,0 +1,69 @@
|
||||
"""Re-generate AI trio (knowledge_reminder, ai_hint, solution) in English for existing questions."""
|
||||
|
||||
import json
|
||||
import asyncio
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.llm_clients import get_qwen_client
|
||||
from app.services.paper_processor import ANALYSIS_PROMPT
|
||||
|
||||
|
||||
async def regenerate_for_paper(paper_id: str):
    """Regenerate knowledge_reminder, ai_hint and solution for every question of one paper.

    Each question is sent to the ``qwen-plus`` model with ``ANALYSIS_PROMPT``
    and the JSON reply is written back to the same ``paper_questions`` row.
    Any error (LLM call, JSON parsing, database write) propagates to the caller.

    Args:
        paper_id: Id of the paper whose questions are regenerated.
    """
    sb = get_supabase()
    qwen = get_qwen_client()

    questions = sb.table("paper_questions").select("*").eq("paper_id", paper_id).order("display_order").execute().data
    print(f"Found {len(questions)} questions for paper {paper_id[:8]}")

    for q in questions:
        qnum = q["question_number"]
        print(f" Regenerating Q{qnum}...", end=" ", flush=True)

        answer_section = ""
        if q.get("raw_answer_text"):
            answer_section = f"- Reference answer: {q['raw_answer_text']}"
        elif q.get("correct_option"):
            answer_section = f"- Correct option: {q['correct_option']}"
        elif q.get("correct_answer"):
            answer_section = f"- Correct answer: {q['correct_answer']}"

        # Rows come from select("*"), so these keys exist even when the column
        # is NULL; a .get() default would never apply. Treat None as "missing".
        score = q.get("score")
        topics = q.get("topics") or []

        # NOTE(review): this is a synchronous call inside an async function;
        # presumably get_qwen_client() returns a sync client — confirm.
        resp = qwen.chat.completions.create(
            model="qwen-plus",
            messages=[
                {"role": "system", "content": ANALYSIS_PROMPT.format(
                    question_number=qnum,
                    question_type=q["question_type"],
                    score="unknown" if score is None else score,
                    question_text=q["question_text"],
                    topics=", ".join(topics),
                    answer_section=answer_section,
                )},
            ],
            temperature=0.3,
            response_format={"type": "json_object"},
        )
        analysis = json.loads(resp.choices[0].message.content)

        sb.table("paper_questions").update({
            "knowledge_reminder": analysis.get("knowledge_reminder", ""),
            "ai_hint": analysis.get("ai_hint", ""),
            "solution": analysis.get("solution", ""),
        }).eq("id", q["id"]).execute()

        print("done")

    print(f"All questions regenerated for paper {paper_id[:8]}")
|
||||
|
||||
|
||||
async def main():
    """Regenerate the AI fields for every ``ready`` paper, newest first."""
    ready_papers = (
        get_supabase()
        .table("papers")
        .select("id,course_code,year,term")
        .eq("status", "ready")
        .order("created_at", desc=True)
        .execute()
        .data
    )

    for record in ready_papers:
        print(f"\n=== {record['course_code']} {record['year']} {record['term']} ===")
        await regenerate_for_paper(record["id"])

    print("\nAll done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
224
backend/split_comp2211_2022_spring_final_part_a.py
Normal file
224
backend/split_comp2211_2022_spring_final_part_a.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""Split COMP2211 Spring 2022 final part A into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
EXAM_KEY = "COMP2211-2022-spring-final-part-a"
|
||||
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
|
||||
PROBLEM_SEED_PATH = (
|
||||
Path(__file__).resolve().parent.parent
|
||||
/ "pastpaper-scraper"
|
||||
/ "reviews"
|
||||
/ "COMP2211"
|
||||
/ "problem_seed.json"
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable description of one sub-question to carve out of a top-level question.

    The first six fields are required and are passed positionally by the
    ``CHILDREN`` table, so their order must not change.
    """

    # --- identity and location inside the parent question ---
    question_number: str          # number of the sub-question row, e.g. "3e_i"
    parent_question: str          # number of the direct parent, e.g. "3e"
    top_level_number: str         # number of the seed row the text is cut from
    path: tuple[str, ...]         # marker labels leading to the part, e.g. ("e", "i")

    # --- grading and typing ---
    score: float
    question_type: str
    question_format: str | None = None

    # --- optional tagging; None means "fall back to another source" in main() ---
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None

    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a short-answer part.

    Fixes ``question_type`` to ``"long_question"`` and ``question_format`` to
    ``"short_answer"``; every other field is passed through unchanged.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        question_format="short_answer",
        page_number=page_number,
        **tagging,
    )
|
||||
|
||||
|
||||
CHILDREN: list[ChildSpec] = [
|
||||
ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
|
||||
ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "architecture_reasoning"), page_number=2),
|
||||
ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "activation_selection"), page_number=2),
|
||||
ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "metric_reasoning"), page_number=2),
|
||||
ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "hardware_reasoning"), page_number=2),
|
||||
ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "image_processing"), page_number=2),
|
||||
ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "cnn_architecture"), page_number=2),
|
||||
ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "regularization"), page_number=2),
|
||||
ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "game_reasoning"), page_number=2),
|
||||
ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "pruning_reasoning"), page_number=2),
|
||||
ChildSpec("2a", "2", "2", ("a",), 6.5, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "probability_reasoning", "classification_decision"), page_number=4),
|
||||
ChildSpec("2b", "2", "2", ("b",), 7.5, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "algorithm_tracing", "classification_decision"), page_number=4),
|
||||
short_answer("3a", "3", "3", ("a",), 3, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("concept_explanation", "metric_reasoning"), page_number=6),
|
||||
short_answer("3b", "3", "3", ("b",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation", "activation_selection"), page_number=6),
|
||||
short_answer("3c", "3", "3", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("architecture_reasoning", "output_layer_design"), page_number=6),
|
||||
short_answer("3d", "3", "3", ("d",), 3, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation", "optimization_reasoning"), page_number=6),
|
||||
short_answer("3e_i", "3e", "3", ("e", "i"), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("optimization_reasoning",), page_number=6),
|
||||
short_answer("3e_ii", "3e", "3", ("e", "ii"), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("optimization_reasoning",), page_number=6),
|
||||
short_answer("3f", "3", "3", ("f",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("regularization", "concept_explanation"), page_number=6),
|
||||
ChildSpec("4a_i", "4a", "4", ("a", "i"), 2, "fill_blank", "fill_blank", page_number=7),
|
||||
ChildSpec("4a_ii", "4a", "4", ("a", "ii"), 2, "long_question", "long_answer", page_number=7),
|
||||
ChildSpec("4b_i", "4b", "4", ("b", "i"), 3, "fill_blank", "fill_blank", page_number=7),
|
||||
ChildSpec("4b_ii", "4b", "4", ("b", "ii"), 4, "fill_blank", "fill_blank", page_number=7),
|
||||
ChildSpec("4b_iii", "4b", "4", ("b", "iii"), 4, "long_question", "long_answer", page_number=7),
|
||||
]
|
||||
|
||||
|
||||
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")

# Roman-numeral labels in reading order; used to find the label that closes a sub-part.
_ROMAN_LABELS = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x")


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* at every line that starts with a ``(label)`` marker.

    Returns:
        ``(intro, sections)`` where *intro* is the stripped text before the
        first marker and *sections* maps each label to its stripped section
        (marker included, up to the next marker of any level). Without any
        marker the result is ``(text.strip(), {})``. When a label occurs more
        than once, the last occurrence wins.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    intro = text[: matches[0].start()].strip()
    sections: dict[str, str] = {}
    for idx, match in enumerate(matches):
        marker = match.group(1)
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        sections[marker] = text[match.start() : end].strip()
    return intro, sections


def _following_label(marker: str, depth: int) -> str | None:
    """Return the label expected to follow *marker* at the same nesting level, if any.

    Assumes the numbering used by the paths in this file: letters at the top
    level, roman numerals below it.
    """
    if depth > 0 and marker in _ROMAN_LABELS:
        position = _ROMAN_LABELS.index(marker) + 1
        return _ROMAN_LABELS[position] if position < len(_ROMAN_LABELS) else None
    if len(marker) == 1 and "a" <= marker < "z":
        return chr(ord(marker) + 1)
    return None


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, prefixed by the question intro.

    *path* lists marker labels from the outermost level inwards, e.g.
    ``("e", "i")`` for part (e)(i). The text before the first marker (the
    question stem) is kept in front of the extracted part.

    MARKER_RE matches markers of every level, so a flat split cuts a parent
    section at its first sub-marker and nested paths could never reach their
    sub-part. A non-final path element therefore spans up to the next label of
    its own level (``(e)`` runs until ``(f)``); only the final element stops
    at the next marker of any level.

    If a label is not found, the widest text located so far is returned, with
    the intro not repeated. When a label occurs more than once in the searched
    text, its first occurrence is used.

    Known limitation: a parent whose next sibling shares a label with its own
    first child (``(h)`` followed by roman ``(i)``) still resolves to the
    parent's lead text.
    """
    current = text.strip()
    carried_intro: list[str] = []
    last_depth = len(path) - 1

    for depth, marker in enumerate(path):
        matches = list(MARKER_RE.finditer(current))

        # Below the top level, the text starts with the parent's own marker;
        # never mistake it for the child being searched.
        candidates = matches
        if depth > 0 and matches and matches[0].start() == 0:
            candidates = matches[1:]

        start_match = next((m for m in candidates if m.group(1) == marker), None)
        if start_match is None:
            # Label absent: keep what we have. At depth 0 this is the whole
            # text, which already contains the intro, so nothing was carried.
            break

        later = [m for m in matches if m.start() > start_match.start()]
        if depth < last_depth:
            # A parent section also owns its sub-parts: end at the next sibling only.
            sibling = _following_label(marker, depth)
            later = [m for m in later if m.group(1) == sibling]
        end = later[0].start() if later else len(current)

        if depth == 0:
            intro = current[: matches[0].start()].strip()
            if intro:
                carried_intro.append(intro)

        current = current[start_match.start() : end].strip()

    return "\n".join(part for part in [*carried_intro, current] if part).strip()
|
||||
|
||||
|
||||
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map each part label ``a``..``j`` to its ``"T"`` / ``"F"`` verdict.

    A verdict is recognised when a line starts with ``(label)`` and is
    followed, on the same or the next line, by a standalone ``T`` or ``F``.
    For a repeated label the last occurrence wins.
    """
    verdict_pattern = r"(?m)^\(([a-j])\)\s*\n?([TF])\b"
    return dict(re.findall(verdict_pattern, answer_text))
|
||||
|
||||
|
||||
def derive_correct_answer(answer_text: str) -> str | None:
    """Extract a short, single-line answer from a raw answer text.

    The first non-blank line after ``"Answer:"`` (or of the whole text when
    that tag is absent) is returned. None is returned when there is no such
    line, when it starts with "marking scheme" (case-insensitive), or when it
    is longer than 240 characters.
    """
    if not answer_text:
        return None

    _, tag, remainder = answer_text.partition("Answer:")
    body = remainder if tag else answer_text

    candidate = next(
        (stripped for stripped in map(str.strip, body.splitlines()) if stripped),
        None,
    )
    if candidate is None:
        return None

    is_marking_scheme = candidate.lower().startswith("marking scheme")
    if is_marking_scheme or len(candidate) > 240:
        return None
    return candidate
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load the reviewed seed rows of this exam, keyed by question number.

    Reads ``PROBLEM_SEED_PATH`` and keeps only rows whose ``source_exam_key``
    equals ``EXAM_KEY``.

    Raises:
        FileNotFoundError: if the seed file does not exist.
        KeyError: if a row lacks ``source_exam_key`` or ``question_number``.
    """
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and would break on non-ASCII exam text.
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Replace the paper's question rows with one row per sub-question in ``CHILDREN``.

    Question and answer text are cut out of the reviewed seed rows. Topic
    tags, difficulty and the AI-generated fields are taken from an existing
    row with the same question number when there is one. Afterwards the
    paper's ``question_count`` is updated and its status set to ``processing``.

    Raises:
        RuntimeError: if no paper row exists for ``EXAM_KEY``.
        KeyError: if the seed file lacks a top-level question that
            ``CHILDREN`` refers to (raised before anything is deleted).
    """
    sb = get_supabase()
    paper_rows = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not paper_rows:
        # Fail with a clear message instead of an IndexError on ``data[0]``.
        raise RuntimeError(
            f"No paper found with source_exam_key={EXAM_KEY!r}; import the paper before splitting it."
        )
    paper_id = paper_rows[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)

        correct_option = None
        correct_answer = None
        options = None
        if child.question_type == "true_false":
            # NOTE(review): this stores "T"/"F" while TRUE_FALSE_OPTIONS labels are
            # "True"/"False" — confirm how consumers compare correct_option to labels.
            correct_option = tf_answers.get(child.path[0])
            options = TRUE_FALSE_OPTIONS
        elif child.question_type == "fill_blank":
            correct_answer = derive_correct_answer(raw_answer_text)

        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": correct_answer,
                "raw_answer_text": raw_answer_text,
                # Tag precedence: existing row, then the spec, then the seed parent.
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # All rows are built before anything is deleted, so bad seed data aborts
    # without data loss. NOTE(review): delete + insert are two separate requests;
    # a failed insert leaves the paper without questions.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
232
backend/split_comp2211_2022_spring_final_part_b.py
Normal file
232
backend/split_comp2211_2022_spring_final_part_b.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Split COMP2211 Spring 2022 final part B into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Value matched against papers.source_exam_key in main() and against each
# seed row's "source_exam_key" in load_seed_rows().
EXAM_KEY = "COMP2211-2022-spring-final-part-b"
# Seed JSON read by load_seed_rows(); resolved two directory levels above this file.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable, hand-written description of one subquestion row that main() inserts."""

    # Stored as the row's "question_number" (e.g. "1f_i").
    question_number: str
    # Stored as the row's "parent_question" (e.g. "1" or "1f").
    parent_question: str
    # "question_number" of the seed row whose text and answer are split for this child.
    top_level_number: str
    # Marker labels passed to extract_segment(), outermost first, e.g. ("f", "i").
    path: tuple[str, ...]
    # Stored as the row's "score".
    score: float
    # Stored as the row's "question_type" / "question_format".
    question_type: str
    question_format: str | None = None
    # Tagging values; main() prefers the value of an existing row, then these,
    # then the parent seed row's value.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # (label, text) pairs; main() converts them to {"label", "text"} dicts.
    options: tuple[tuple[str, str], ...] | None = None
    # Stored unchanged as the row's "correct_option" / "correct_answer".
    correct_option: str | None = None
    correct_answer: str | None = None
    # Stored as the row's "page_number".
    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build the ChildSpec for a short-answer part.

    The question type is fixed to "long_question" and the format to
    "short_answer". All other fields are copied from the arguments; the
    option-related fields keep their ``None`` defaults.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        "short_answer",
        correct_answer=correct_answer,
        page_number=page_number,
        **tagging,
    )
|
||||
|
||||
|
||||
def mc(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    options: tuple[tuple[str, str], ...],
    correct_option: str,
    analytics_topic: str,
    skill_tags: tuple[str, ...],
    page_number: int,
) -> ChildSpec:
    """Build the ChildSpec for a multiple-choice part.

    Both the question type and the format are "mc". The single
    ``analytics_topic`` is reused as the primary topic and as the only
    topic tag.
    """
    topic = analytics_topic
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "mc",
        "mc",
        analytics_topic=topic,
        topic_primary=topic,
        topic_tags=(topic,),
        skill_tags=skill_tags,
        options=options,
        correct_option=correct_option,
        page_number=page_number,
    )
|
||||
|
||||
|
||||
# (label, text) option pairs for the four-choice questions; the text of each
# option is just its letter.
ETHICS_ABCD = tuple((letter, letter) for letter in "ABCD")
|
||||
|
||||
|
||||
# Every subquestion row to create, in display order: main() enumerates this
# list from 1 and stores the position as "display_order".
CHILDREN: list[ChildSpec] = [
    ChildSpec("1a", "1", "1", ("a",), 1.5, "long_question", "long_answer", page_number=2),
    short_answer("1b", "1", "1", ("b",), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("concept_explanation", "data_augmentation"), page_number=2),
    ChildSpec("1c", "1", "1", ("c",), 4.5, "long_question", "long_answer", page_number=2),
    short_answer("1d", "1", "1", ("d",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "parameter_reduction"), page_number=3),
    ChildSpec("1e", "1", "1", ("e",), 2.5, "fill_blank", "fill_blank", correct_answer="1558656", page_number=3),
    ChildSpec("1f_i", "1f", "1", ("f", "i"), 2.5, "fill_blank", "fill_blank", correct_answer="2071656", page_number=3),
    ChildSpec("1f_ii", "1f", "1", ("f", "ii"), 2.5, "fill_blank", "fill_blank", correct_answer="150529000", page_number=4),
    short_answer("1g", "1", "1", ("g",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "comparison"), page_number=4),
    ChildSpec("2a", "2", "2", ("a",), 9, "long_question", "coding", page_number=5),
    short_answer("2b", "2", "2", ("b",), 4, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("architecture_reasoning", "regression_reasoning"), page_number=6),
    ChildSpec("3a", "3", "3", ("a",), 3.5, "long_question", "long_answer", page_number=9),
    short_answer("3b", "3", "3", ("b",), 0.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("game_reasoning",), correct_answer="E-a", page_number=9),
    short_answer("3c", "3", "3", ("c",), 1.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("concept_explanation", "game_reasoning"), page_number=9),
    short_answer("3d", "3", "3", ("d",), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning_reasoning",), correct_answer="E-j and E-f", page_number=9),
    mc("4a", "4", "4", ("a",), 1, options=ETHICS_ABCD, correct_option="C", analytics_topic="Ethics of AI", skill_tags=("concept_check", "ethical_reasoning"), page_number=10),
    mc("4b", "4", "4", ("b",), 1, options=ETHICS_ABCD, correct_option="A", analytics_topic="Ethics of AI", skill_tags=("concept_check", "bias_reasoning"), page_number=10),
    mc("4c", "4", "4", ("c",), 1, options=ETHICS_ABCD, correct_option="C", analytics_topic="Ethics of AI", skill_tags=("concept_check", "ethical_reasoning"), page_number=10),
    mc("4d", "4", "4", ("d",), 1, options=ETHICS_ABCD, correct_option="B", analytics_topic="Ethics of AI", skill_tags=("concept_check", "bias_reasoning"), page_number=10),
    short_answer("4e", "4", "4", ("e",), 3, analytics_topic="Ethics of AI", topic_primary="Ethics of AI", topic_tags=("Ethics of AI",), skill_tags=("argumentation", "concept_explanation"), page_number=11),
]
|
||||
|
||||
|
||||
# Matches a part marker such as "(a)" or "(iv)" at the start of a line and
# captures the label; trailing whitespace after the marker is consumed too.
# The former "|[ivx]+" alternative was unreachable because "[a-z]+" already
# matches every roman-numeral label, so it has been dropped. The pattern alone
# cannot tell the letter "(i)" from the roman numeral "(i)".
MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
|
||||
|
||||
|
||||
def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Cut *text* at every line-leading marker matched by MARKER_RE.

    Returns a pair ``(intro, sections)``. ``intro`` is the stripped text
    before the first marker (the whole stripped text when there is no
    marker). ``sections`` maps each marker label to the stripped text running
    from that marker up to the next one; a label that occurs more than once
    keeps its last occurrence.

    The split is flat: every marker starts a new section regardless of
    nesting level.
    """
    hits = list(MARKER_RE.finditer(text))
    if len(hits) == 0:
        return text.strip(), {}

    starts = [hit.start() for hit in hits]
    stops = starts[1:] + [len(text)]
    by_label: dict[str, str] = {}
    for hit, begin, stop in zip(hits, starts, stops):
        by_label[hit.group(1)] = text[begin:stop].strip()
    return text[: starts[0]].strip(), by_label
|
||||
|
||||
|
||||
# Same marker syntax as MARKER_RE; kept private so the level-aware splitting
# below is self-contained.
_SEGMENT_MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
_ROMAN_NUMERALS = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii")


def _is_next_sibling(previous: str, candidate: str, roman_level: bool) -> bool:
    """Return True when *candidate* continues the current level after *previous*."""
    if roman_level:
        if candidate not in _ROMAN_NUMERALS:
            return False
        if previous not in _ROMAN_NUMERALS:
            return True
        return _ROMAN_NUMERALS.index(candidate) > _ROMAN_NUMERALS.index(previous)
    if len(candidate) != 1:
        # Multi-letter labels such as "ii" are never lettered siblings.
        return False
    if candidate in "ivx":
        # "i", "v" and "x" are also roman numerals: accept them as letters
        # only when they directly follow their alphabetical predecessor.
        return ord(candidate) == ord(previous) + 1
    return candidate > previous


def _split_level(text: str, search_from: int = 0) -> tuple[str, dict[str, str]]:
    """Split *text* on the markers of its outermost level only.

    Markers are searched from offset *search_from*. The first marker found
    decides whether the level is lettered or roman-numbered; later markers
    that do not continue that sequence are treated as nested and stay inside
    the enclosing section. Returns ``(lead, sections)`` where ``lead`` is the
    stripped text before the first sibling marker.
    """
    siblings = []
    roman_level = False
    for match in _SEGMENT_MARKER_RE.finditer(text, search_from):
        label = match.group(1)
        if not siblings:
            roman_level = label == "i" or len(label) > 1
            siblings.append(match)
        elif _is_next_sibling(siblings[-1].group(1), label, roman_level):
            siblings.append(match)
    if not siblings:
        return text.strip(), {}
    starts = [match.start() for match in siblings]
    ends = [*starts[1:], len(text)]
    sections = {
        match.group(1): text[start:end].strip()
        for match, start, end in zip(siblings, starts, ends)
    }
    return text[: starts[0]].strip(), sections


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, with its leading context.

    *path* lists marker labels from the outermost level inwards, e.g.
    ``("f", "i")`` for part (f)(i). The result is the text before the first
    top-level marker, then the stem of every ancestor part, then the
    addressed part, joined by newlines.

    Two defects of the previous flat split are fixed:

    * a parent section used to end at its first nested marker, so a second
      path level could never be found;
    * text without any marker was returned twice (once as intro, once as the
      unchanged body).

    If a label cannot be found, descent stops and what has been resolved so
    far is returned; for a missing top-level label that is the stripped
    *text*. Known limitation: a roman "(i)" nested directly under part "(h)"
    is read as the letter i.
    """
    current = text.strip()
    context: list[str] = []
    search_from = 0
    for marker in path:
        lead, sections = _split_level(current, search_from)
        if marker not in sections:
            break
        if lead:
            context.append(lead)
        current = sections[marker]
        # The section starts with its own marker; look for children after it.
        own_marker = _SEGMENT_MARKER_RE.match(current)
        search_from = own_marker.end() if own_marker else 0
    return "\n".join(part for part in [*context, current] if part).strip()
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load the seed problems of this exam, keyed by their "question_number".

    Reads PROBLEM_SEED_PATH and keeps only the rows whose "source_exam_key"
    equals EXAM_KEY.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which read_text() would otherwise use.
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Replace the paper's question rows with the subquestions listed in CHILDREN.

    For each ChildSpec the question and answer text are cut out of the
    matching seed row. Topic, skill, difficulty and hint fields are taken from
    an existing row with the same question number when present, otherwise
    from the spec or the parent seed row.

    Raises:
        IndexError: if no paper has ``source_exam_key == EXAM_KEY``.
        KeyError: if the seed file has no row for a child's top-level number.
    """
    sb = get_supabase()
    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        # Same exception type as the former bare ``.data[0]``, but with a
        # message that says what is missing.
        raise IndexError(f"No paper found with source_exam_key {EXAM_KEY!r}.")
    paper_id = papers[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()

    # All rows are built before anything is deleted, so a missing seed row
    # fails before the table is touched.
    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)
        options = None
        if child.options:
            options = [{"label": label, "text": text} for label, text in child.options]

        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": child.correct_option,
                "correct_answer": child.correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # NOTE(review): delete and insert are two separate requests; if the insert
    # fails, the paper is left without question rows.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")
|
||||
|
||||
|
||||
# Run the split only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
233
backend/split_comp2211_2022_spring_midterm.py
Normal file
233
backend/split_comp2211_2022_spring_midterm.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Split COMP2211 Spring 2022 midterm top-level problems into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Value matched against papers.source_exam_key in main() and against each
# seed row's "source_exam_key" in load_seed_rows().
EXAM_KEY = "COMP2211-2022-spring-midterm"
# Option list that main() attaches to every "true_false" child.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable, hand-written description of one subquestion row that main() inserts."""

    # Stored as the row's "question_number" (e.g. "2a_i").
    question_number: str
    # Stored as the row's "parent_question" (e.g. "2" or "2a").
    parent_question: str
    # "question_number" of the seed row whose text and answer are split for this child.
    top_level_number: str
    # Marker labels passed to extract_segment(), outermost first, e.g. ("a", "i").
    path: tuple[str, ...]
    # Stored as the row's "score".
    score: float
    # main() derives options/answers differently for "true_false" and "fill_blank".
    question_type: str
    question_format: str | None = None
    # Stored as the row's "page_number".
    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    page_number: int,
) -> ChildSpec:
    """Build the ChildSpec for a short-answer part.

    The question type is fixed to "long_question" and the format to
    "short_answer"; the remaining fields are copied from the arguments.
    """
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        "short_answer",
        page_number=page_number,
    )
|
||||
|
||||
|
||||
# Every subquestion row to create, in display order: main() enumerates this
# list from 1 and stores the position as "display_order".
CHILDREN: list[ChildSpec] = [
    # Problem 1: ten true/false parts (a)-(j), generated from the letters.
    *[
        ChildSpec(f"1{letter}", "1", "1", (letter,), 1.5, "true_false", page_number=2)
        for letter in "abcdefghij"
    ],
    ChildSpec("2a_i", "2a", "2", ("a", "i"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_ii", "2a", "2", ("a", "ii"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_iii", "2a", "2", ("a", "iii"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_iv", "2a", "2", ("a", "iv"), 1, "fill_blank", page_number=4),
    ChildSpec("2a_v", "2a", "2", ("a", "v"), 1, "fill_blank", page_number=4),
    ChildSpec("2b", "2", "2", ("b",), 2, "fill_blank", page_number=4),
    ChildSpec("2c", "2", "2", ("c",), 9, "long_question", "coding", page_number=5),
    ChildSpec("3a", "3", "3", ("a",), 2, "fill_blank", page_number=7),
    ChildSpec("3b_i", "3b", "3", ("b", "i"), 1.75, "fill_blank", page_number=7),
    ChildSpec("3b_ii", "3b", "3", ("b", "ii"), 1.75, "fill_blank", page_number=7),
    ChildSpec("3b_iii", "3b", "3", ("b", "iii"), 1.75, "fill_blank", page_number=7),
    ChildSpec("3b_iv", "3b", "3", ("b", "iv"), 1.75, "fill_blank", page_number=7),
    short_answer("3c", "3", "3", ("c",), 2, page_number=8),
    ChildSpec("4a", "4", "4", ("a",), 3, "long_question", "long_answer", page_number=9),
    short_answer("4b_i", "4b", "4", ("b", "i"), 3, page_number=9),
    short_answer("4b_ii", "4b", "4", ("b", "ii"), 3, page_number=9),
    ChildSpec("4c_i", "4c", "4", ("c", "i"), 2, "long_question", "long_answer", page_number=10),
    ChildSpec("4c_ii", "4c", "4", ("c", "ii"), 3, "long_question", "long_answer", page_number=10),
    ChildSpec("5a", "5", "5", ("a",), 4.5, "long_question", "long_answer", page_number=11),
    ChildSpec("5b", "5", "5", ("b",), 1.5, "fill_blank", page_number=11),
    ChildSpec("5c", "5", "5", ("c",), 4.5, "long_question", "long_answer", page_number=11),
    short_answer("5d", "5", "5", ("d",), 1.5, page_number=11),
    ChildSpec("6a", "6", "6", ("a",), 8, "long_question", "long_answer", page_number=12),
    short_answer("6b", "6", "6", ("b",), 2, page_number=13),
    ChildSpec("6c", "6", "6", ("c",), 10, "long_question", "coding", page_number=13),
    short_answer("7a", "7", "7", ("a",), 4, page_number=14),
    short_answer("7b", "7", "7", ("b",), 6, page_number=14),
    ChildSpec("7c", "7", "7", ("c",), 2, "fill_blank", page_number=15),
]
|
||||
|
||||
|
||||
# Matches a lowercase part marker such as "(a)" or "(iv)" at the start of a
# line and captures the label; whitespace after the marker is consumed too.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
# Seed JSON read by load_seed_rows(); resolved two directory levels above this file.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)
|
||||
|
||||
|
||||
def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Cut *text* at every line-leading marker matched by MARKER_RE.

    Returns ``(intro, sections)``: the stripped text before the first marker
    (or the whole stripped text when no marker exists), and a mapping from
    each marker label to the stripped text between that marker and the next
    one. A repeated label keeps its last occurrence. The split is flat, so
    nested markers start sections of their own.
    """
    found = list(MARKER_RE.finditer(text))
    if not found:
        return text.strip(), {}

    boundaries = [hit.start() for hit in found]
    boundaries.append(len(text))
    pieces: dict[str, str] = {}
    for position, hit in enumerate(found):
        piece = text[boundaries[position] : boundaries[position + 1]]
        pieces[hit.group(1)] = piece.strip()
    return text[: boundaries[0]].strip(), pieces
|
||||
|
||||
|
||||
# Same marker syntax as MARKER_RE; kept private so the level-aware splitting
# below is self-contained.
_SEGMENT_MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
_ROMAN_NUMERALS = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii")


def _is_next_sibling(previous: str, candidate: str, roman_level: bool) -> bool:
    """Return True when *candidate* continues the current level after *previous*."""
    if roman_level:
        if candidate not in _ROMAN_NUMERALS:
            return False
        if previous not in _ROMAN_NUMERALS:
            return True
        return _ROMAN_NUMERALS.index(candidate) > _ROMAN_NUMERALS.index(previous)
    if len(candidate) != 1:
        # Multi-letter labels such as "ii" are never lettered siblings.
        return False
    if candidate in "ivx":
        # "i", "v" and "x" are also roman numerals: accept them as letters
        # only when they directly follow their alphabetical predecessor.
        return ord(candidate) == ord(previous) + 1
    return candidate > previous


def _split_level(text: str, search_from: int = 0) -> tuple[str, dict[str, str]]:
    """Split *text* on the markers of its outermost level only.

    Markers are searched from offset *search_from*. The first marker found
    decides whether the level is lettered or roman-numbered; later markers
    that do not continue that sequence are treated as nested and stay inside
    the enclosing section. Returns ``(lead, sections)`` where ``lead`` is the
    stripped text before the first sibling marker.
    """
    siblings = []
    roman_level = False
    for match in _SEGMENT_MARKER_RE.finditer(text, search_from):
        label = match.group(1)
        if not siblings:
            roman_level = label == "i" or len(label) > 1
            siblings.append(match)
        elif _is_next_sibling(siblings[-1].group(1), label, roman_level):
            siblings.append(match)
    if not siblings:
        return text.strip(), {}
    starts = [match.start() for match in siblings]
    ends = [*starts[1:], len(text)]
    sections = {
        match.group(1): text[start:end].strip()
        for match, start, end in zip(siblings, starts, ends)
    }
    return text[: starts[0]].strip(), sections


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by a one- or two-level *path*.

    * Empty path, or top-level label not found: the stripped *text*.
    * One label: the intro before the first marker plus that part, including
      any nested sub-parts it contains.
    * Two labels: intro, the parent part's stem, and the addressed sub-part.
      If the sub-part is not found, only intro and stem are returned.

    The previous flat split ended a parent part at its first nested marker,
    so the second label could never be found and a two-level path always
    produced only the intro. Splitting is now done level by level. Known
    limitation: a roman "(i)" nested directly under part "(h)" is read as
    the letter i.
    """
    if not path:
        return text.strip()
    intro, sections = _split_level(text)
    first = sections.get(path[0], "")
    if not first:
        return text.strip()
    if len(path) == 1:
        return "\n".join(part for part in [intro, first] if part).strip()
    # The parent part starts with its own marker; look for children after it.
    own_marker = _SEGMENT_MARKER_RE.match(first)
    stem, child_sections = _split_level(first, own_marker.end() if own_marker else 0)
    second = child_sections.get(path[1], "")
    return "\n".join(part for part in [intro, stem, second] if part).strip()
|
||||
|
||||
|
||||
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Collect the "T"/"F" written after each ``(a)``-``(j)`` marker.

    A marker counts only at the start of a line, and the letter T or F must
    follow it (optionally after whitespace or a line break) as a whole word.
    Returns a mapping from part letter to "T" or "F"; when a letter appears
    more than once the last occurrence wins.
    """
    pattern = re.compile(r"(?m)^\(([a-j])\)\s*\n?([TF])\b")
    return {hit.group(1): hit.group(2) for hit in pattern.finditer(answer_text)}
|
||||
|
||||
|
||||
def derive_correct_answer(answer_text: str) -> str | None:
|
||||
if not answer_text:
|
||||
return None
|
||||
if "Answer:" in answer_text:
|
||||
tail = answer_text.split("Answer:", 1)[1]
|
||||
else:
|
||||
tail = answer_text
|
||||
lines = [line.strip() for line in tail.splitlines() if line.strip()]
|
||||
if not lines:
|
||||
return None
|
||||
first = lines[0]
|
||||
if first.lower().startswith("marking scheme"):
|
||||
return None
|
||||
if len(first) <= 240:
|
||||
return first
|
||||
return None
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load the seed problems of this exam, keyed by their "question_number".

    Reads PROBLEM_SEED_PATH and keeps only the rows whose "source_exam_key"
    equals EXAM_KEY.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which read_text() would otherwise use.
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Replace the paper's question rows with the subquestions listed in CHILDREN.

    For each ChildSpec the question and answer text are cut out of the
    matching seed row. True/false parts get TRUE_FALSE_OPTIONS and the answer
    letter parsed from problem 1's answer text; fill-in-the-blank parts get an
    answer derived from their own answer text. Topic, skill, difficulty and
    hint fields come from an existing row with the same question number when
    present, otherwise from the parent seed row.

    Raises:
        IndexError: if no paper has ``source_exam_key == EXAM_KEY``.
        KeyError: if the seed file lacks problem "1" or a child's top-level number.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id")
        .eq("source_exam_key", EXAM_KEY)
        .execute()
        .data
    )
    if not papers:
        # Same exception type as the former bare ``.data[0]``, but with a
        # message that says what is missing.
        raise IndexError(f"No paper found with source_exam_key {EXAM_KEY!r}.")
    paper_id = papers[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    # All rows are built before anything is deleted, so a missing seed row
    # fails before the table is touched.
    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)

        correct_option = None
        correct_answer = None
        options = None
        if child.question_type == "true_false":
            marker = child.path[0]
            # NOTE(review): tf_answers holds "T"/"F" while the option labels
            # are "True"/"False" - confirm consumers accept the short form.
            correct_option = tf_answers.get(marker)
            options = TRUE_FALSE_OPTIONS
        elif child.question_type == "fill_blank":
            correct_answer = derive_correct_answer(raw_answer_text)

        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or parent.get("topics"),
                "topic_primary": existing.get("topic_primary") or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or parent.get("topic_tags"),
                "skill_tags": existing.get("skill_tags") or parent.get("skill_tags"),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # NOTE(review): delete and insert are two separate requests; if the insert
    # fails, the paper is left without question rows.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")
|
||||
|
||||
|
||||
# Run the split only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
268
backend/split_comp2211_2023_spring_midterm.py
Normal file
268
backend/split_comp2211_2023_spring_midterm.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""Split COMP2211 Spring 2023 midterm into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Value matched against papers.source_exam_key in main() and against each
# seed row's "source_exam_key" in load_seed_rows().
EXAM_KEY = "COMP2211-2023-spring-midterm"
# Seed JSON read by load_seed_rows(); resolved two directory levels above this file.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)
# Label/text dicts for the two true/false choices.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable, hand-written description of one subquestion of the exam."""

    # Identifier of the subquestion (e.g. "2a_ix"); main() uses it to look up an existing row.
    question_number: str
    # Identifier of the enclosing question (e.g. "2" or "2a").
    parent_question: str
    # "question_number" of the seed row whose text and answer are split for this child.
    top_level_number: str
    # Marker labels passed to extract_segment(), outermost first; empty for an unsplit problem.
    path: tuple[str, ...]
    score: float
    question_type: str
    question_format: str | None = None
    # Topic and skill tagging for this part.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # (label, text) pairs for multiple-choice parts.
    options: tuple[tuple[str, str], ...] | None = None
    correct_option: str | None = None
    correct_answer: str | None = None
    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build the ChildSpec for a short-answer part.

    Type "long_question" and format "short_answer" are filled in here; all
    other fields come straight from the arguments, and the option-related
    fields keep their ``None`` defaults.
    """
    fixed = ("long_question", "short_answer")
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        *fixed,
        analytics_topic=analytics_topic,
        topic_primary=topic_primary,
        topic_tags=topic_tags,
        skill_tags=skill_tags,
        correct_answer=correct_answer,
        page_number=page_number,
    )
|
||||
|
||||
|
||||
def mc(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    options: tuple[tuple[str, str], ...],
    correct_option: str,
    analytics_topic: str,
    skill_tags: tuple[str, ...],
    page_number: int,
) -> ChildSpec:
    """Build the ChildSpec for a multiple-choice part.

    Type and format are both "mc". The one ``analytics_topic`` given also
    serves as the primary topic and as the sole topic tag.
    """
    topic_fields = {
        "analytics_topic": analytics_topic,
        "topic_primary": analytics_topic,
        "topic_tags": (analytics_topic,),
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "mc",
        "mc",
        skill_tags=skill_tags,
        options=options,
        correct_option=correct_option,
        page_number=page_number,
        **topic_fields,
    )
|
||||
|
||||
|
||||
# (label, text) option pairs for five-choice questions; each option's text is just its letter.
ABCDE = tuple((letter, letter) for letter in "ABCDE")
|
||||
|
||||
|
||||
# Every subquestion of the exam in display order: main() enumerates this list from 1.
CHILDREN: list[ChildSpec] = [
    ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("concept_check", "classification_decision"), page_number=3),
    ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("concept_check", "classification_decision"), page_number=3),
    ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
    ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "distance_reasoning"), page_number=3),
    ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "validation_reasoning"), page_number=3),
    ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
    ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "robustness_reasoning"), page_number=3),
    ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "decision_boundary"), page_number=3),
    ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "optimization_reasoning"), page_number=3),
    ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "expressiveness_reasoning"), page_number=3),
    short_answer("2a_i", "2a", "2", ("a", "i"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_ii", "2a", "2", ("a", "ii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_iii", "2a", "2", ("a", "iii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_iv", "2a", "2", ("a", "iv"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
    short_answer("2a_v", "2a", "2", ("a", "v"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("indexing", "code_tracing"), page_number=4),
    short_answer("2a_vi", "2a", "2", ("a", "vi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("indexing", "error_reasoning"), page_number=5),
    short_answer("2a_vii", "2a", "2", ("a", "vii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("masking", "code_tracing"), page_number=5),
    short_answer("2a_viii", "2a", "2", ("a", "viii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("aggregation", "code_tracing"), page_number=5),
    short_answer("2a_ix", "2a", "2", ("a", "ix"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("transpose", "code_tracing"), page_number=5),
    short_answer("2b_i", "2b", "2", ("b", "i"), 2, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting", "code_tracing"), page_number=6),
    short_answer("2b_ii", "2b", "2", ("b", "ii"), 2, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting", "error_reasoning"), page_number=6),
    short_answer("2b_iii", "2b", "2", ("b", "iii"), 2, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting", "code_tracing"), page_number=6),
    ChildSpec("2c", "2", "2", ("c",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "geometry_reasoning"), page_number=7),
    # Problem 3 is kept whole: the empty path selects the full problem text.
    short_answer("3", "3", "3", (), 8, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("concept_explanation", "missing_data_reasoning"), page_number=9),
    ChildSpec("4a", "4", "4", ("a",), 8, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "classification_decision"), page_number=10),
    short_answer("4b", "4", "4", ("b",), 6, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("distance_reasoning", "comparison"), page_number=11),
    ChildSpec("5a", "5", "5", ("a",), 7, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "algorithm_tracing"), page_number=12),
    ChildSpec("5b", "5", "5", ("b",), 7, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("centroid_update", "algorithm_tracing"), page_number=12),
    short_answer("5c", "5", "5", ("c",), 5, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("concept_explanation", "model_selection"), page_number=14),
    short_answer("6a", "6", "6", ("a",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("convergence_reasoning",), page_number=15),
    mc("6b", "6", "6", ("b",), 2, options=ABCDE, correct_option="D", analytics_topic="Perceptron and MLP", skill_tags=("generalization_reasoning",), page_number=15),
    short_answer("6c", "6", "6", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("activation_reasoning",), page_number=16),
    ChildSpec("6d", "6", "6", ("d",), 6, "long_question", "coding", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("debugging", "implementation", "weight_update"), page_number=16),
    short_answer("7a", "7", "7", ("a",), 4, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("decision_boundary", "linearity_reasoning"), page_number=18),
    short_answer("7b", "7", "7", ("b",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("decision_boundary", "linearity_reasoning"), page_number=18),
    ChildSpec("7c", "7", "7", ("c",), 10, "long_question", "long_answer", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("architecture_reasoning", "parameter_design"), page_number=19),
]
|
||||
|
||||
|
||||
# Matches a part marker such as "(a)" or "(iv)" at the start of a line and
# captures the label; trailing whitespace after the marker is consumed too.
# The former "|[ivx]+" alternative was unreachable because "[a-z]+" already
# matches every roman-numeral label, so it has been dropped. The pattern alone
# cannot tell the letter "(i)" from the roman numeral "(i)".
MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
|
||||
|
||||
|
||||
def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Cut *text* at every line-leading marker matched by MARKER_RE.

    The first element of the result is the stripped text that precedes the
    first marker, or the whole stripped text when there is none. The second
    maps each marker label to the stripped span from that marker to the next
    marker (or to the end of the text); a repeated label keeps its last span.
    Nesting is not considered: every marker opens a new span.
    """
    markers = list(MARKER_RE.finditer(text))
    if not markers:
        return text.strip(), {}

    spans: dict[str, str] = {}
    following = markers[1:] + [None]
    for here, after in zip(markers, following):
        stop = len(text) if after is None else after.start()
        spans[here.group(1)] = text[here.start() : stop].strip()
    preamble = text[: markers[0].start()].strip()
    return preamble, spans
|
||||
|
||||
|
||||
# Same marker syntax as MARKER_RE; kept private so the level-aware splitting
# below is self-contained.
_SEGMENT_MARKER_RE = re.compile(r"(?m)^\(([a-z]+)\)\s*")
_ROMAN_NUMERALS = ("i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii")


def _is_next_sibling(previous: str, candidate: str, roman_level: bool) -> bool:
    """Return True when *candidate* continues the current level after *previous*."""
    if roman_level:
        if candidate not in _ROMAN_NUMERALS:
            return False
        if previous not in _ROMAN_NUMERALS:
            return True
        return _ROMAN_NUMERALS.index(candidate) > _ROMAN_NUMERALS.index(previous)
    if len(candidate) != 1:
        # Multi-letter labels such as "ii" are never lettered siblings.
        return False
    if candidate in "ivx":
        # "i", "v" and "x" are also roman numerals: accept them as letters
        # only when they directly follow their alphabetical predecessor.
        return ord(candidate) == ord(previous) + 1
    return candidate > previous


def _split_level(text: str, search_from: int = 0) -> tuple[str, dict[str, str]]:
    """Split *text* on the markers of its outermost level only.

    Markers are searched from offset *search_from*. The first marker found
    decides whether the level is lettered or roman-numbered; later markers
    that do not continue that sequence are treated as nested and stay inside
    the enclosing section. Returns ``(lead, sections)`` where ``lead`` is the
    stripped text before the first sibling marker.
    """
    siblings = []
    roman_level = False
    for match in _SEGMENT_MARKER_RE.finditer(text, search_from):
        label = match.group(1)
        if not siblings:
            roman_level = label == "i" or len(label) > 1
            siblings.append(match)
        elif _is_next_sibling(siblings[-1].group(1), label, roman_level):
            siblings.append(match)
    if not siblings:
        return text.strip(), {}
    starts = [match.start() for match in siblings]
    ends = [*starts[1:], len(text)]
    sections = {
        match.group(1): text[start:end].strip()
        for match, start, end in zip(siblings, starts, ends)
    }
    return text[: starts[0]].strip(), sections


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by *path*, with its leading context.

    *path* lists marker labels from the outermost level inwards, e.g.
    ``("a", "ix")`` for part (a)(ix); an empty path returns the stripped
    text. The result is the text before the first top-level marker, then the
    stem of every ancestor part, then the addressed part, joined by newlines.

    Two defects of the previous flat split are fixed:

    * a parent section used to end at its first nested marker, so a second
      path level could never be found;
    * text without any marker was returned twice (once as intro, once as the
      unchanged body).

    If a label cannot be found, descent stops and what has been resolved so
    far is returned; for a missing top-level label that is the stripped
    *text*. Known limitation: a roman "(i)" nested directly under part "(h)"
    is read as the letter i.
    """
    current = text.strip()
    context: list[str] = []
    search_from = 0
    for marker in path:
        lead, sections = _split_level(current, search_from)
        if marker not in sections:
            break
        if lead:
            context.append(lead)
        current = sections[marker]
        # The section starts with its own marker; look for children after it.
        own_marker = _SEGMENT_MARKER_RE.match(current)
        search_from = own_marker.end() if own_marker else 0
    return "\n".join(part for part in [*context, current] if part).strip()
|
||||
|
||||
|
||||
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map true/false sub-question letters (a-j) to "T" or "F".

    Returns an empty dict when any marker is followed by both "T" and "F",
    i.e. when the text lists the two choices instead of a single answer.
    Otherwise markers followed directly by one letter are used; if there are
    none, the text is scanned line by line for a bare "(x)" line followed
    later by a bare "T" or "F" line.
    """
    if re.search(r"(?m)^\(([a-j])\)\s*\n?T\s*F", answer_text):
        return {}

    found: dict[str, str] = {
        hit.group(1): hit.group(2)
        for hit in re.finditer(r"(?m)^\(([a-j])\)\s*\n?([TF])\b", answer_text)
    }
    if found:
        return found

    pending = None
    for raw_line in answer_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        label = re.fullmatch(r"\(([a-j])\)", line)
        if label is not None:
            pending = label.group(1)
        elif pending and line in ("T", "F"):
            found[pending] = line
            pending = None
    return found
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load the seed rows for this exam, keyed by ``question_number``.

    Reads the JSON file at ``PROBLEM_SEED_PATH`` and keeps only rows whose
    ``source_exam_key`` equals ``EXAM_KEY``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which can mis-decode non-ASCII exam text.
    seed = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {row["question_number"]: row for row in seed if row["source_exam_key"] == EXAM_KEY}
|
||||
|
||||
|
||||
def main() -> None:
    """Replace this paper's ``paper_questions`` rows with one row per child.

    Each entry of ``CHILDREN`` becomes a row whose text is cut out of its
    top-level seed question. Values already stored for the same
    ``question_number`` take precedence for the topic, skill, difficulty and
    hint/solution columns.

    Raises:
        RuntimeError: if the paper or a required seed question is missing.
            Both checks run before any row is deleted.
    """
    sb = get_supabase()
    paper_rows = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not paper_rows:
        raise RuntimeError(f"No papers row found for source_exam_key={EXAM_KEY!r}.")
    paper_id = paper_rows[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    # Question "1" is always needed because it carries the true/false answer key.
    required = {"1", *(child.top_level_number for child in CHILDREN)}
    missing = sorted(required.difference(parent_rows))
    if missing:
        raise RuntimeError(f"Seed data for {EXAM_KEY} lacks question(s): {', '.join(missing)}.")
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path) if child.path else (parent["raw_answer_text"] or "")

        options = None
        correct_option = child.correct_option
        if child.options:
            options = [{"label": label, "text": text} for label, text in child.options]
        if child.question_type == "true_false":
            # True/false rows always use the fixed option list and the parsed key.
            options = TRUE_FALSE_OPTIONS
            correct_option = tf_answers.get(child.path[0])

        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": child.correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # NOTE: delete and insert are separate requests; if the insert fails the
    # old rows are already gone.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")
|
||||
|
||||
|
||||
# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
242
backend/split_comp2211_2024_spring_final.py
Normal file
242
backend/split_comp2211_2024_spring_final.py
Normal file
@@ -0,0 +1,242 @@
|
||||
"""Split COMP2211 Spring 2024 final into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Exam identifier: selects the papers row and filters the seed rows.
EXAM_KEY = "COMP2211-2024-spring-final"
# Seed JSON, located relative to the directory above this script's directory.
PROBLEM_SEED_PATH = (
    Path(__file__).resolve().parent.parent
    / "pastpaper-scraper"
    / "reviews"
    / "COMP2211"
    / "problem_seed.json"
)
# Option list given to every true/false child row.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable description of one sub-question row to create.

    The field order is relied on by the positional entries in ``CHILDREN``.
    """

    # Value stored in the "question_number" column, e.g. "3a_i".
    question_number: str
    # Value stored in the "parent_question" column, e.g. "3a".
    parent_question: str
    # Key of the seed row whose text this child is cut from.
    top_level_number: str
    # Marker path handed to extract_segment; empty means the whole question.
    path: tuple[str, ...]
    # Marks awarded for this sub-question.
    score: float
    # "true_false" makes main() attach the fixed True/False options.
    question_type: str
    question_format: str | None = None
    # Topic/skill values used when the existing row has none stored.
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    # (label, text) pairs that main() converts to option dicts.
    options: tuple[tuple[str, str], ...] | None = None
    # Replaced by the parsed answer key for true/false rows.
    correct_option: str | None = None
    correct_answer: str | None = None
    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    correct_answer: str | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a short-answer sub-question.

    The type is fixed to "long_question" and the format to "short_answer";
    every other value is passed through unchanged.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        "short_answer",
        correct_answer=correct_answer,
        page_number=page_number,
        **tagging,
    )
|
||||
|
||||
|
||||
CHILDREN: list[ChildSpec] = [
|
||||
ChildSpec("1a", "1", "1", ("a",), 1, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "code_tracing"), page_number=2),
|
||||
ChildSpec("1b", "1", "1", ("b",), 1, "true_false", "true_false", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("concept_check", "classification_decision"), page_number=2),
|
||||
ChildSpec("1c", "1", "1", ("c",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
|
||||
ChildSpec("1d", "1", "1", ("d",), 1, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=2),
|
||||
ChildSpec("1e", "1", "1", ("e",), 1, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "activation_reasoning"), page_number=2),
|
||||
ChildSpec("1f", "1", "1", ("f",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "image_processing"), page_number=2),
|
||||
ChildSpec("1g", "1", "1", ("g",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "cnn_complexity"), page_number=2),
|
||||
ChildSpec("1h", "1", "1", ("h",), 1, "true_false", "true_false", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("concept_check", "regularization"), page_number=2),
|
||||
ChildSpec("1i", "1", "1", ("i",), 1, "true_false", "true_false", "Search and Games", "Search and Games", ("Search and Games",), ("concept_check", "pruning_reasoning"), page_number=2),
|
||||
ChildSpec("1j", "1", "1", ("j",), 1, "true_false", "true_false", "Ethics of AI", "Ethics of AI", ("Ethics of AI",), ("concept_check", "research_ethics"), page_number=2),
|
||||
ChildSpec("2a", "2", "2", ("a",), 4, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "masking"), page_number=3),
|
||||
ChildSpec("2b", "2", "2", ("b",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "convolution", "array_manipulation"), page_number=4),
|
||||
short_answer("3a_i", "3a", "3", ("a", "i"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
|
||||
short_answer("3a_ii", "3a", "3", ("a", "ii"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
|
||||
short_answer("3a_iii", "3a", "3", ("a", "iii"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
|
||||
short_answer("3a_iv", "3a", "3", ("a", "iv"), 1.5, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("manual_computation", "probability_reasoning"), page_number=6),
|
||||
short_answer("3b_i", "3b", "3", ("b", "i"), 1.5, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("validation_reasoning",), page_number=6),
|
||||
short_answer("3b_ii", "3b", "3", ("b", "ii"), 1.5, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("validation_reasoning",), page_number=6),
|
||||
short_answer("3b_iii", "3b", "3", ("b", "iii"), 1.5, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("validation_reasoning",), page_number=6),
|
||||
short_answer("3c", "3", "3", ("c",), 1.5, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("linearity_reasoning", "classification_decision"), page_number=6),
|
||||
short_answer("4a_i", "4a", "4", ("a", "i"), 2.5, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("parameter_counting",), page_number=7),
|
||||
short_answer("4a_ii", "4a", "4", ("a", "ii"), 2.5, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("model_selection",), page_number=7),
|
||||
short_answer("4b", "4", "4", ("b",), 1, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("concept_explanation",), page_number=7),
|
||||
short_answer("4c", "4", "4", ("c",), 2, analytics_topic="Perceptron and MLP", topic_primary="Perceptron and MLP", topic_tags=("Perceptron and MLP",), skill_tags=("activation_reasoning", "optimization_reasoning"), page_number=7),
|
||||
ChildSpec("4d_i", "4d", "4", ("d", "i"), 1.5, "long_question", "long_answer", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("forward_pass", "activation_reasoning"), page_number=8),
|
||||
ChildSpec("4d_ii", "4d", "4", ("d", "ii"), 1.5, "long_question", "long_answer", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("backpropagation", "weight_update"), page_number=8),
|
||||
ChildSpec("5a", "5", "5", ("a",), 4.5, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("histogram_reasoning", "image_transform"), page_number=9),
|
||||
ChildSpec("5b", "5", "5", ("b",), 3, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("thresholding", "manual_computation"), page_number=10),
|
||||
ChildSpec("5c", "5", "5", ("c",), 2, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("padding", "manual_construction"), page_number=10),
|
||||
short_answer("5d_i", "5d", "5", ("d", "i"), 0.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("filter_effect_reasoning",), page_number=11),
|
||||
short_answer("5d_ii", "5d", "5", ("d", "ii"), 0.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("filter_effect_reasoning",), page_number=11),
|
||||
short_answer("5d_iii", "5d", "5", ("d", "iii"), 0.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("filter_effect_reasoning",), page_number=11),
|
||||
short_answer("5e", "5", "5", ("e",), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("concept_explanation", "local_vs_global"), page_number=11),
|
||||
ChildSpec("6a", "6", "6", ("a",), 10, "long_question", "coding", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("implementation", "convolution", "debugging"), page_number=12),
|
||||
ChildSpec("6b", "6", "6", ("b",), 3, "long_question", "coding", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("implementation", "regularization"), page_number=15),
|
||||
short_answer("7a_i", "7a", "7", ("a", "i"), 1, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("cnn_architecture",), page_number=16),
|
||||
short_answer("7a_ii", "7a", "7", ("a", "ii"), 4, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("shape_reasoning", "parameter_counting"), page_number=16),
|
||||
short_answer("7a_iii", "7a", "7", ("a", "iii"), 3, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("overfitting", "regularization"), page_number=16),
|
||||
ChildSpec("7b", "7", "7", ("b",), 5, "long_question", "long_answer", "Vision and CNN", "Vision and CNN", ("Vision and CNN",), ("manual_computation", "cnn_forward_pass"), page_number=17),
|
||||
short_answer("7c_i", "7c", "7", ("c", "i"), 2, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("shape_reasoning", "3d_convolution"), page_number=17),
|
||||
short_answer("7c_ii", "7c", "7", ("c", "ii"), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("parameter_counting", "3d_convolution"), page_number=17),
|
||||
short_answer("7c_iii", "7c", "7", ("c", "iii"), 1.5, analytics_topic="Vision and CNN", topic_primary="Vision and CNN", topic_tags=("Vision and CNN",), skill_tags=("parameter_counting", "3d_convolution"), page_number=17),
|
||||
short_answer("8a_i", "8a", "8", ("a", "i"), 1, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("tree_search", "manual_tracing"), page_number=18),
|
||||
short_answer("8a_ii", "8a", "8", ("a", "ii"), 3, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning", "manual_tracing"), page_number=18),
|
||||
short_answer("8a_iii", "8a", "8", ("a", "iii"), 1, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("game_reasoning",), page_number=18),
|
||||
short_answer("8b_i", "8b", "8", ("b", "i"), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("utility_reasoning",), page_number=18),
|
||||
short_answer("8b_ii", "8b", "8", ("b", "ii"), 2.5, analytics_topic="Search and Games", topic_primary="Search and Games", topic_tags=("Search and Games",), skill_tags=("pruning_reasoning", "concept_explanation"), page_number=18),
|
||||
short_answer("9", "9", "9", (), 3, analytics_topic="Ethics of AI", topic_primary="Ethics of AI", topic_tags=("Ethics of AI",), skill_tags=("concept_explanation", "governance"), page_number=19),
|
||||
]
|
||||
|
||||
|
||||
# Matches a parenthesised lowercase marker such as "(a)" or "(iii)" at the start of a line.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")

# Lowercase roman numerals that can be written with i/v/x only (1..39), in counting order.
_ROMAN_SEQUENCE = tuple(
    "x" * (value // 10) + ("", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix")[value % 10]
    for value in range(1, 40)
)
_ROMAN_INDEX = {numeral: position for position, numeral in enumerate(_ROMAN_SEQUENCE)}


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* at every line-start marker, ignoring nesting.

    Returns ``(intro, sections)`` where ``intro`` is the stripped text before
    the first marker and ``sections`` maps each marker to the stripped text
    from that marker up to the next marker of any kind. A marker that occurs
    more than once keeps only its last section. Without markers the result is
    ``(text.strip(), {})``.
    """
    matches = list(MARKER_RE.finditer(text))
    if not matches:
        return text.strip(), {}
    intro = text[: matches[0].start()].strip()
    sections: dict[str, str] = {}
    for idx, match in enumerate(matches):
        marker = match.group(1)
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        sections[marker] = text[match.start() : end].strip()
    return intro, sections


def _successors(marker: str) -> set[str]:
    """Return the markers that may directly follow *marker* at the same level."""
    following: set[str] = set()
    if len(marker) == 1 and "a" <= marker < "z":
        following.add(chr(ord(marker) + 1))
    position = _ROMAN_INDEX.get(marker)
    if position is not None and position + 1 < len(_ROMAN_SEQUENCE):
        following.add(_ROMAN_SEQUENCE[position + 1])
    return following


def _same_series(left: str, right: str) -> bool:
    """Return True when both markers could belong to one numbering series."""
    both_letters = len(left) == 1 and len(right) == 1
    both_roman = left in _ROMAN_INDEX and right in _ROMAN_INDEX
    return both_letters or both_roman


def _outline(text: str) -> list[tuple[tuple[str, ...], int]]:
    """Return ``(path, start_offset)`` for each structural marker in *text*.

    Nesting is inferred from numbering order: a marker that continues the
    sequence of an open level is a sibling at that level, while a sequence
    starter ("a" or "i") that continues nothing opens a child level.
    """
    nodes: list[tuple[tuple[str, ...], int]] = []
    open_levels: list[str] = []
    for match in MARKER_RE.finditer(text):
        marker = match.group(1)
        if len(marker) != 1 and marker not in _ROMAN_INDEX:
            # Parenthesised words such as "(bonus)" are not part of the numbering.
            continue
        depths = range(len(open_levels) - 1, -1, -1)
        level = next((d for d in depths if marker in _successors(open_levels[d])), None)
        if level is None and marker in ("a", "i"):
            level = len(open_levels)
        if level is None:
            # Out-of-sequence marker (e.g. a skipped sibling): attach it to the
            # innermost level that uses the same kind of numbering.
            level = next((d for d in depths if _same_series(open_levels[d], marker)), None)
        if level is None:
            level = max(len(open_levels) - 1, 0)
        del open_levels[level:]
        open_levels.append(marker)
        nodes.append((tuple(open_levels), match.start()))
    return nodes


def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by the marker *path*.

    ``path`` names nested markers, e.g. ``("a", "ii")`` for part (ii) of (a).
    The result is the text before the first marker (the shared intro) followed
    by the addressed part, including any of its own sub-parts.

    The previous implementation split on every marker regardless of level, so
    a lettered section was cut at its first roman-numeral child and nested
    paths always fell back to the parent stem; it also returned the text twice
    when no markers were present. Both are fixed here.

    If the full path cannot be found, the longest leading part of it that does
    exist is used; if nothing matches, the stripped text is returned unchanged.
    An empty path returns the stripped text.
    """
    body = text.strip()
    if not path:
        return body
    nodes = _outline(body)
    if not nodes:
        return body
    intro = body[: nodes[0][1]].strip()
    for size in range(len(path), 0, -1):
        wanted = tuple(path[:size])
        index = next((i for i, (node_path, _) in enumerate(nodes) if node_path == wanted), None)
        if index is not None:
            break
    else:
        return body
    start = nodes[index][1]
    # The part ends where the next marker at the same or a shallower level begins.
    end = next((offset for node_path, offset in nodes[index + 1 :] if len(node_path) <= size), len(body))
    segment = body[start:end].strip()
    return "\n".join(part for part in (intro, segment) if part)
|
||||
|
||||
|
||||
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Map the ten true/false sub-questions "a".."j" to "T" or "F".

    An answer table is tried first: the word "Answer" followed by ten
    whitespace-separated T/F entries. The previous pattern only recognised one
    specific answer sequence; any ten entries are accepted now, so a table is
    no longer bypassed in favour of the looser fallback.

    Fallback: the first ten standalone "T"/"F" tokens anywhere in the text.
    Returns an empty dict when fewer than ten are found.
    """
    letters = "abcdefghij"
    table_match = re.search(r"Answer\s+((?:[TF]\s+){9}[TF])(?!\w)", answer_text)
    if table_match:
        return dict(zip(letters, table_match.group(1).split()))

    tokens = re.findall(r"\b([TF])\b", answer_text)
    if len(tokens) >= len(letters):
        return dict(zip(letters, tokens))
    return {}
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load the seed rows for this exam, keyed by ``question_number``.

    Reads the JSON file at ``PROBLEM_SEED_PATH`` and keeps only rows whose
    ``source_exam_key`` equals ``EXAM_KEY``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which can mis-decode non-ASCII exam text.
    seed = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {row["question_number"]: row for row in seed if row["source_exam_key"] == EXAM_KEY}
|
||||
|
||||
|
||||
def main() -> None:
    """Replace this paper's ``paper_questions`` rows with one row per child.

    Each entry of ``CHILDREN`` becomes a row whose text is cut out of its
    top-level seed question. Values already stored for the same
    ``question_number`` take precedence for the topic, skill, difficulty and
    hint/solution columns.

    Raises:
        RuntimeError: if the paper or a required seed question is missing.
            Both checks run before any row is deleted.
    """
    sb = get_supabase()
    paper_rows = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not paper_rows:
        raise RuntimeError(f"No papers row found for source_exam_key={EXAM_KEY!r}.")
    paper_id = paper_rows[0]["id"]
    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = load_seed_rows()
    # Question "1" is always needed because it carries the true/false answer key.
    required = {"1", *(child.top_level_number for child in CHILDREN)}
    missing = sorted(required.difference(parent_rows))
    if missing:
        raise RuntimeError(f"Seed data for {EXAM_KEY} lacks question(s): {', '.join(missing)}.")
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    inserts = []
    for display_order, child in enumerate(CHILDREN, start=1):
        parent = parent_rows[child.top_level_number]
        existing = existing_by_number.get(child.question_number, {})
        question_text = extract_segment(parent["question_text"] or "", child.path)
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path) if child.path else (parent["raw_answer_text"] or "")

        options = None
        correct_option = child.correct_option
        if child.question_type == "true_false":
            # True/false rows always use the fixed option list and the parsed key.
            options = TRUE_FALSE_OPTIONS
            correct_option = tf_answers.get(child.path[0])
        elif child.options:
            options = [{"label": label, "text": text} for label, text in child.options]

        inserts.append(
            {
                "paper_id": paper_id,
                "question_number": child.question_number,
                "parent_question": child.parent_question,
                "display_order": display_order,
                "question_type": child.question_type,
                "question_format": child.question_format,
                "question_text": question_text,
                "score": child.score,
                "page_number": child.page_number,
                "page_y_ratio": existing.get("page_y_ratio"),
                "options": options,
                "correct_option": correct_option,
                "correct_answer": child.correct_answer,
                "raw_answer_text": raw_answer_text,
                "topics": existing.get("topics") or (list(child.topic_tags) if child.topic_tags else parent.get("topics")),
                "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
                "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
                "topic_tags": existing.get("topic_tags") or (list(child.topic_tags) if child.topic_tags else parent.get("topic_tags")),
                "skill_tags": existing.get("skill_tags") or (list(child.skill_tags) if child.skill_tags else parent.get("skill_tags")),
                "difficulty": existing.get("difficulty") or parent.get("difficulty"),
                "knowledge_reminder": existing.get("knowledge_reminder", ""),
                "ai_hint": existing.get("ai_hint", ""),
                "solution": existing.get("solution", ""),
            }
        )

    # NOTE: delete and insert are separate requests; if the insert fails the
    # old rows are already gone.
    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")
|
||||
|
||||
|
||||
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
291
backend/split_comp2211_2024_spring_midterm.py
Normal file
291
backend/split_comp2211_2024_spring_midterm.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""Rebuild COMP2211 Spring 2024 midterm into subquestions."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
|
||||
from app.services.supabase_client import get_supabase
|
||||
|
||||
|
||||
# Exam identifier used to filter the seed rows.
EXAM_KEY = "COMP2211-2024-spring-midterm"
# Directory above the one containing this script.
ROOT = Path(__file__).resolve().parent.parent
# Source PDFs whose page text replaces the seed text for question 5.
QUESTION_PDF = ROOT / "pastpaper-scraper" / "papers" / "COMP2211" / "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf"
ANSWER_PDF = ROOT / "pastpaper-scraper" / "papers" / "COMP2211" / "(COMP2211)[2024](s)midterm~=ubrzkjmz^_90406.pdf"
# Seed JSON holding the top-level question rows.
PROBLEM_SEED_PATH = ROOT / "pastpaper-scraper" / "reviews" / "COMP2211" / "problem_seed.json"
# True/False option list; presumably attached to true/false rows — usage is outside this view.
TRUE_FALSE_OPTIONS = [{"label": "True", "text": "True"}, {"label": "False", "text": "False"}]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChildSpec:
    """Immutable description of one sub-question of the exam.

    The field order is relied on by the positional entries in ``CHILDREN``.
    """

    # Sub-question identifier, e.g. "2a_iii".
    question_number: str
    # Identifier of the enclosing part, e.g. "2a".
    parent_question: str
    # Number of the top-level question, e.g. "2".
    top_level_number: str
    # Nested markers locating the part, e.g. ("a", "iii"); empty for a whole question.
    path: tuple[str, ...]
    # Marks for this sub-question.
    score: float
    # e.g. "true_false" or "long_question".
    question_type: str
    # e.g. "short_answer", "long_answer" or "coding".
    question_format: str | None = None
    analytics_topic: str | None = None
    topic_primary: str | None = None
    topic_tags: tuple[str, ...] | None = None
    skill_tags: tuple[str, ...] | None = None
    page_number: int = 1
|
||||
|
||||
|
||||
def short_answer(
    question_number: str,
    parent_question: str,
    top_level_number: str,
    path: tuple[str, ...],
    score: float,
    *,
    analytics_topic: str | None = None,
    topic_primary: str | None = None,
    topic_tags: tuple[str, ...] | None = None,
    skill_tags: tuple[str, ...] | None = None,
    page_number: int,
) -> ChildSpec:
    """Build a ``ChildSpec`` for a short-answer sub-question.

    The type is fixed to "long_question" and the format to "short_answer";
    every other value is passed through unchanged.
    """
    tagging = {
        "analytics_topic": analytics_topic,
        "topic_primary": topic_primary,
        "topic_tags": topic_tags,
        "skill_tags": skill_tags,
    }
    return ChildSpec(
        question_number,
        parent_question,
        top_level_number,
        path,
        score,
        "long_question",
        "short_answer",
        page_number=page_number,
        **tagging,
    )
|
||||
|
||||
|
||||
CHILDREN: list[ChildSpec] = [
|
||||
ChildSpec("1a", "1", "1", ("a",), 0.5, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "code_tracing"), page_number=3),
|
||||
ChildSpec("1b", "1", "1", ("b",), 0.5, "true_false", "true_false", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("concept_check", "broadcasting"), page_number=3),
|
||||
ChildSpec("1c", "1", "1", ("c",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "algorithm_property"), page_number=3),
|
||||
ChildSpec("1d", "1", "1", ("d",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "tie_reasoning"), page_number=3),
|
||||
ChildSpec("1e", "1", "1", ("e",), 0.5, "true_false", "true_false", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("concept_check", "cross_validation"), page_number=3),
|
||||
ChildSpec("1f", "1", "1", ("f",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "clustering_property"), page_number=3),
|
||||
ChildSpec("1g", "1", "1", ("g",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "robustness_reasoning"), page_number=3),
|
||||
ChildSpec("1h", "1", "1", ("h",), 0.5, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "decision_boundary"), page_number=3),
|
||||
ChildSpec("1i", "1", "1", ("i",), 0.5, "true_false", "true_false", "Perceptron and MLP", "Perceptron and MLP", ("Perceptron and MLP",), ("concept_check", "optimization_reasoning"), page_number=3),
|
||||
ChildSpec("1j", "1", "1", ("j",), 0.5, "true_false", "true_false", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("concept_check", "clustering_property"), page_number=3),
|
||||
short_answer("2a_i", "2a", "2", ("a", "i"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
|
||||
short_answer("2a_ii", "2a", "2", ("a", "ii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("code_tracing",), page_number=4),
|
||||
short_answer("2a_iii", "2a", "2", ("a", "iii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("array_manipulation",), page_number=5),
|
||||
short_answer("2a_iv", "2a", "2", ("a", "iv"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("array_construction",), page_number=5),
|
||||
short_answer("2a_v", "2a", "2", ("a", "v"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("aggregation",), page_number=5),
|
||||
short_answer("2a_vi", "2a", "2", ("a", "vi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("transpose",), page_number=6),
|
||||
short_answer("2a_vii", "2a", "2", ("a", "vii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("matrix_multiplication",), page_number=6),
|
||||
short_answer("2a_viii", "2a", "2", ("a", "viii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("dot_product",), page_number=6),
|
||||
short_answer("2a_ix", "2a", "2", ("a", "ix"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting",), page_number=6),
|
||||
short_answer("2a_x", "2a", "2", ("a", "x"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("error_reasoning",), page_number=7),
|
||||
short_answer("2a_xi", "2a", "2", ("a", "xi"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("broadcasting",), page_number=7),
|
||||
short_answer("2a_xii", "2a", "2", ("a", "xii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("slicing",), page_number=7),
|
||||
short_answer("2a_xiii", "2a", "2", ("a", "xiii"), 1, analytics_topic="Python Fundamentals", topic_primary="Python Fundamentals", topic_tags=("Python Fundamentals",), skill_tags=("views_vs_copies",), page_number=7),
|
||||
ChildSpec("2b", "2", "2", ("b",), 6, "long_question", "coding", "Python Fundamentals", "Python Fundamentals", ("Python Fundamentals",), ("implementation", "vectorization", "similarity_computation"), page_number=8),
|
||||
ChildSpec("3a", "3", "3", ("a",), 5.5, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("manual_computation", "metric_reasoning"), page_number=10),
|
||||
short_answer("3b", "3", "3", ("b",), 1, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("metric_reasoning",), page_number=11),
|
||||
ChildSpec("3c", "3", "3", ("c",), 2.5, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("manual_computation", "metric_reasoning"), page_number=11),
|
||||
short_answer("3d", "3", "3", ("d",), 1, analytics_topic="Evaluation and Validation", topic_primary="Evaluation and Validation", topic_tags=("Evaluation and Validation",), skill_tags=("metric_reasoning",), page_number=12),
|
||||
ChildSpec("3e", "3", "3", ("e",), 6, "long_question", "coding", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("implementation", "metrics", "vectorization"), page_number=12),
|
||||
ChildSpec("4a", "4", "4", ("a",), 4, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "gaussian_nb"), page_number=15),
|
||||
ChildSpec("4b", "4", "4", ("b",), 3, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("manual_computation", "likelihood_reasoning"), page_number=15),
|
||||
ChildSpec("4c", "4", "4", ("c",), 4, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("laplace_smoothing", "likelihood_reasoning"), page_number=16),
|
||||
short_answer("4d", "4", "4", ("d",), 2, analytics_topic="Probabilistic Models", topic_primary="Probabilistic Models", topic_tags=("Probabilistic Models",), skill_tags=("prior_reasoning",), page_number=17),
|
||||
ChildSpec("4e", "4", "4", ("e",), 3, "long_question", "long_answer", "Probabilistic Models", "Probabilistic Models", ("Probabilistic Models",), ("posterior_reasoning", "classification_decision"), page_number=17),
|
||||
ChildSpec("5a", "5", "5", ("a",), 3, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("distance_calculation", "weighted_knn"), page_number=18),
|
||||
ChildSpec("5b", "5", "5", ("b",), 13, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("cross_validation", "manual_tracing", "model_selection"), page_number=18),
|
||||
short_answer("5c", "5", "5", ("c",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("test_error", "model_selection"), page_number=20),
|
||||
ChildSpec("6a", "6", "6", ("a",), 6, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("manual_computation", "clustering"), page_number=21),
|
||||
ChildSpec("6b", "6", "6", ("b",), 6, "long_question", "long_answer", "KNN and Clustering", "KNN and Clustering", ("KNN and Clustering",), ("manual_computation", "clustering"), page_number=22),
|
||||
short_answer("6c", "6", "6", ("c",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("outlier_reasoning",), page_number=22),
|
||||
short_answer("6d", "6", "6", ("d",), 2, analytics_topic="KNN and Clustering", topic_primary="KNN and Clustering", topic_tags=("KNN and Clustering",), skill_tags=("model_selection", "threshold_reasoning"), page_number=22),
|
||||
ChildSpec("7", "7", "7", (), 10, "long_question", "long_answer", "Evaluation and Validation", "Evaluation and Validation", ("Evaluation and Validation",), ("cross_validation", "data_leakage_reasoning"), page_number=23),
|
||||
]
|
||||
|
||||
|
||||
# Matches a sub-question marker such as "(a)" or "(ii)" at the start of a line.
# The second alternative is already covered by the first; kept as written.
MARKER_RE = re.compile(r"(?m)^\(([a-z]+|[ivx]+)\)\s*")


def split_sections(text: str) -> tuple[str, dict[str, str]]:
    """Split *text* at every line-leading ``(marker)``.

    Returns a pair ``(intro, sections)``:

    * ``intro`` is the stripped text that precedes the first marker (the
      whole stripped text when there is no marker at all).
    * ``sections`` maps each marker label (without parentheses) to the
      stripped text running from that marker up to the next marker or the
      end of the text.  The marker itself stays part of the section text.
      When a label occurs more than once, the last occurrence wins.
    """
    anchors = [(found.start(), found.group(1)) for found in MARKER_RE.finditer(text)]
    if not anchors:
        return text.strip(), {}

    # Each section stops where the next one begins; the last runs to the end.
    stops = [position for position, _ in anchors[1:]] + [len(text)]

    sections: dict[str, str] = {}
    for (begin, label), stop in zip(anchors, stops):
        sections[label] = text[begin:stop].strip()

    first_marker_at = anchors[0][0]
    return text[:first_marker_at].strip(), sections
|
||||
|
||||
|
||||
def extract_segment(text: str, path: tuple[str, ...]) -> str:
    """Return the part of *text* addressed by the marker *path*.

    With an empty path the stripped text is returned unchanged.  Otherwise
    the text is narrowed one marker at a time using ``split_sections``; a
    marker that cannot be found leaves the current text as it is.  The
    introduction preceding the first marker of the top level (if any) is
    prepended to the result on its own line(s).
    """
    body = text.strip()
    if not path:
        return body

    preamble = ""
    for level, label in enumerate(path):
        lead, parts = split_sections(body)
        # Only the top-level introduction is carried into the result.
        if level == 0:
            preamble = lead
        if label in parts:
            body = parts[label]

    pieces = [piece for piece in (preamble, body) if piece]
    return "\n".join(pieces).strip()
|
||||
|
||||
|
||||
def extract_pages(pdf_path: Path, start: int, end: int) -> str:
    """Return the plain text of pages *start* through *end* of a PDF.

    Page numbers are 1-based and inclusive.  The text of consecutive pages
    is joined with a newline.  The document is closed even when reading a
    page fails.
    """
    document = fitz.open(pdf_path)
    try:
        # fitz indexes pages from 0, hence the ``start - 1``.
        page_texts = [
            document[index].get_text("text") for index in range(start - 1, end)
        ]
    finally:
        document.close()
    return "\n".join(page_texts)
|
||||
|
||||
|
||||
def load_seed_rows() -> dict[str, dict]:
    """Load the seed rows that belong to ``EXAM_KEY``.

    Reads the JSON list stored at ``PROBLEM_SEED_PATH`` and returns the rows
    whose ``source_exam_key`` equals ``EXAM_KEY``, keyed by their
    ``question_number``.  If two rows share a question number, the later one
    replaces the earlier one.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    data = json.loads(PROBLEM_SEED_PATH.read_text(encoding="utf-8"))
    return {
        row["question_number"]: row
        for row in data
        if row["source_exam_key"] == EXAM_KEY
    }
|
||||
|
||||
|
||||
def build_source_rows(existing_rows: dict[str, dict]) -> dict[str, dict]:
    """Return the parent question rows for this exam, keyed by question number.

    The result starts from the seed rows (``load_seed_rows``).  Questions 5
    and 7 then get their ``question_text``, ``raw_answer_text`` and
    ``page_number`` re-extracted from fixed page ranges of ``QUESTION_PDF``
    and ``ANSWER_PDF``.

    * When the seed has the question, the topic/skill/difficulty metadata
      listed below is written over the seed values as well.
    * When the seed lacks the question, the row is based on the matching
      entry of *existing_rows* (or an empty row if there is none) and only
      the extracted fields are set.  Previously this path looked the missing
      key up in the seed again and therefore always raised ``KeyError``.

    Args:
        existing_rows: Current rows keyed by question number, used only as a
            fallback base for questions missing from the seed.
    """
    rows = dict(load_seed_rows())

    # question number -> (question pages, answer pages, page_number,
    #                     metadata applied when the seed has the row)
    overrides = {
        "5": (
            (18, 20),
            (21, 25),
            18,
            {
                "analytics_topic": "KNN and Clustering",
                "topic_primary": "KNN and Clustering",
                "topic_tags": ["KNN and Clustering"],
                "skill_tags": ["manual_computation", "distance_calculation", "algorithm_tracing"],
                "difficulty": "medium",
            },
        ),
        "7": (
            (23, 24),
            (31, 34),
            23,
            {
                "analytics_topic": "Evaluation and Validation",
                "topic_primary": "Evaluation and Validation",
                "topic_tags": ["Evaluation and Validation"],
                "skill_tags": ["cross_validation", "data_leakage_reasoning"],
                "difficulty": "medium",
            },
        ),
    }

    for number, (question_pages, answer_pages, page_number, seed_metadata) in overrides.items():
        extracted = {
            "question_text": extract_pages(QUESTION_PDF, *question_pages),
            "raw_answer_text": extract_pages(ANSWER_PDF, *answer_pages),
            "page_number": page_number,
        }
        if number in rows:
            rows[number] = {**rows[number], **extracted, **seed_metadata}
        else:
            # The seed has no such row, so fall back to what is already stored.
            rows[number] = {**existing_rows.get(number, {}), **extracted}

    return rows
|
||||
|
||||
|
||||
def extract_true_false_answers(answer_text: str) -> dict[str, str]:
    """Extract the ten true/false answers (a)-(j) from *answer_text*.

    Three strategies are tried in order; the first that yields a result wins:

    1. An ``Answer`` table: the word "Answer" followed by a run of ``T``/``F``
       characters and whitespace holding at least ten letters.
    2. A per-line layout where a line consisting of ``(a)`` .. ``(j)`` is
       later followed by a line consisting of just ``T`` or ``F``.
    3. Any ten or more stand-alone ``T``/``F`` tokens anywhere in the text.

    Returns a mapping from sub-question letter to ``"T"`` or ``"F"``; the
    mapping is empty when no strategy succeeds.
    """

    def first_ten_as_letters(values: list[str]) -> dict[str, str]:
        # Pair the first ten values with the letters a..j in order.
        return {chr(ord("a") + offset): value for offset, value in enumerate(values[:10])}

    # Strategy 1: answer table.
    table = re.search(r"Answer\s+([TF\s]+)", answer_text, re.S)
    if table is not None:
        marks = re.findall(r"[TF]", table.group(1))
        if len(marks) >= 10:
            return first_ten_as_letters(marks)

    # Strategy 2: "(x)" line followed by a "T"/"F" line.
    per_letter: dict[str, str] = {}
    pending: str | None = None
    for raw_line in answer_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        label = re.fullmatch(r"\(([a-j])\)", line)
        if label is not None:
            pending = label.group(1)
        elif pending and line in ("T", "F"):
            per_letter[pending] = line
            pending = None
    if per_letter:
        return per_letter

    # Strategy 3: loose T/F tokens.
    marks = re.findall(r"\b([TF])\b", answer_text)
    if len(marks) >= 10:
        return first_ten_as_letters(marks)
    return {}
|
||||
|
||||
|
||||
def _child_row(
    paper_id,
    display_order: int,
    child: "ChildSpec",
    parent: dict,
    existing: dict,
    tf_answers: dict[str, str],
) -> dict:
    """Build the ``paper_questions`` row for one child question.

    Question and answer text are cut out of the parent row with
    ``extract_segment``.  Topic, skill, difficulty and generated-content
    fields prefer the value of the *existing* row, then the child spec, then
    the parent row.
    """
    question_text = extract_segment(parent["question_text"] or "", child.path)
    if child.path:
        raw_answer_text = extract_segment(parent["raw_answer_text"] or "", child.path)
    else:
        raw_answer_text = parent["raw_answer_text"] or ""

    options = None
    correct_option = None
    if child.question_type == "true_false":
        options = TRUE_FALSE_OPTIONS
        correct_option = tf_answers.get(child.path[0])

    child_topic_tags = list(child.topic_tags) if child.topic_tags else None
    child_skill_tags = list(child.skill_tags) if child.skill_tags else None

    return {
        "paper_id": paper_id,
        "question_number": child.question_number,
        "parent_question": child.parent_question,
        "display_order": display_order,
        "question_type": child.question_type,
        "question_format": child.question_format,
        "question_text": question_text,
        "score": child.score,
        "page_number": child.page_number,
        "page_y_ratio": existing.get("page_y_ratio"),
        "options": options,
        "correct_option": correct_option,
        "correct_answer": None,
        "raw_answer_text": raw_answer_text,
        "topics": existing.get("topics") or (child_topic_tags if child_topic_tags else parent.get("topics")),
        "topic_primary": existing.get("topic_primary") or child.topic_primary or parent.get("topic_primary"),
        "analytics_topic": existing.get("analytics_topic") or child.analytics_topic or parent.get("analytics_topic"),
        "topic_tags": existing.get("topic_tags") or (child_topic_tags if child_topic_tags else parent.get("topic_tags")),
        "skill_tags": existing.get("skill_tags") or (child_skill_tags if child_skill_tags else parent.get("skill_tags")),
        "difficulty": existing.get("difficulty") or parent.get("difficulty"),
        "knowledge_reminder": existing.get("knowledge_reminder", ""),
        "ai_hint": existing.get("ai_hint", ""),
        "solution": existing.get("solution", ""),
    }


def main() -> None:
    """Rebuild the ``paper_questions`` rows of the exam ``EXAM_KEY``.

    Looks up the paper by ``source_exam_key``, builds one row per entry of
    ``CHILDREN``, then deletes every stored question of that paper, inserts
    the new rows, and updates the paper's ``question_count`` and ``status``.

    Raises:
        SystemExit: if no paper with ``source_exam_key == EXAM_KEY`` exists.
    """
    sb = get_supabase()

    papers = sb.table("papers").select("id").eq("source_exam_key", EXAM_KEY).execute().data
    if not papers:
        # Fail with a readable message instead of a bare IndexError.
        raise SystemExit(f"No paper found with source_exam_key={EXAM_KEY!r}; nothing to rebuild.")
    paper_id = papers[0]["id"]

    current_rows = (
        sb.table("paper_questions")
        .select("*")
        .eq("paper_id", paper_id)
        .order("display_order")
        .execute()
        .data
    )
    existing_by_number = {row["question_number"]: row for row in current_rows}
    parent_rows = build_source_rows(existing_by_number)
    tf_answers = extract_true_false_answers(parent_rows["1"]["raw_answer_text"] or "")

    # Build every row before touching the table, so an error while building
    # cannot leave the paper with its questions already deleted.
    inserts = [
        _child_row(
            paper_id,
            display_order,
            child,
            parent_rows[child.top_level_number],
            existing_by_number.get(child.question_number, {}),
            tf_answers,
        )
        for display_order, child in enumerate(CHILDREN, start=1)
    ]

    sb.table("paper_questions").delete().eq("paper_id", paper_id).execute()
    sb.table("paper_questions").insert(inserts).execute()
    sb.table("papers").update({"question_count": len(inserts), "status": "processing"}).eq("id", paper_id).execute()
    print(f"Inserted {len(inserts)} rows for {EXAM_KEY}.")


if __name__ == "__main__":
    main()
|
||||
121
backend/upload_course_library_pdfs.py
Normal file
121
backend/upload_course_library_pdfs.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""Upload COMP2211 course-library PDFs to Supabase Storage.
|
||||
|
||||
Run from the backend directory:
|
||||
uv run python upload_course_library_pdfs.py
|
||||
|
||||
Each entry maps a storage path (inside the `papers` bucket) to the local
|
||||
source file under pastpaper-scraper/papers/COMP2211/.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
# Manifest: (storage_path, local_filename)
#   storage_path is relative inside the `papers` bucket.
#   local_filename is relative to PAPERS_DIR below.
#
# The manifest is generated from a per-exam table: each exam folder maps to
# its (paper file, answer file) pair, stored as `paper.pdf` and `answer.pdf`
# under `course-library/COMP2211/<exam folder>/`.
# ---------------------------------------------------------------------------
_EXAM_SOURCES: dict[str, tuple[str, str]] = {
    "COMP2211-2022-fall-midterm": (
        "(COMP2211)[2022](f)midterm~=yjz8dxdd^_27002.pdf",
        "(COMP2211)[2022](f)midterm~=yjz8dxdd^_18747.pdf",
    ),
    "COMP2211-2022-spring-midterm": (
        "(COMP2211)[2022](s)midterm~=b8bidkgs^_14629.pdf",
        "(COMP2211)[2022](s)midterm~=6ma030^_89587.pdf",
    ),
    "COMP2211-2022-spring-final-part-a": (
        "(COMP2211)[2022](s)final~=b8bidkgs^_33018.pdf",
        "(COMP2211)[2022](s)final~=ajou6^_82011.pdf",
    ),
    "COMP2211-2022-spring-final-part-b": (
        "(COMP2211)[2022](s)final~=b8bidkgs^_40627.pdf",
        "(COMP2211)[2022](s)final~=ajou6^_51199.pdf",
    ),
    "COMP2211-2023-spring-midterm": (
        "(COMP2211)[2023](s)midterm~=bxbidkmj^_26587.pdf",
        # NOTE(review): unlike every other entry this name has no "=" after
        # the "~" - confirm it matches the file on disk.
        "(COMP2211)[2023](s)midterm~clchanbg^_17297.pdf",
    ),
    "COMP2211-2024-spring-midterm": (
        "(COMP2211)[2024](s)midterm~=rcidkjgf^_82003.pdf",
        "(COMP2211)[2024](s)midterm~=ubrzkjmz^_90406.pdf",
    ),
    "COMP2211-2024-spring-final": (
        "(COMP2211)[2024](s)final~=igk5mmg^_90365.pdf",
        "(COMP2211)[2024](s)final~=igk5mmg^_58857.pdf",
    ),
}

MANIFEST: list[tuple[str, str]] = [
    (f"course-library/COMP2211/{exam_folder}/{role}.pdf", local_name)
    for exam_folder, (paper_file, answer_file) in _EXAM_SOURCES.items()
    for role, local_name in (("paper", paper_file), ("answer", answer_file))
]
|
||||
|
||||
# Local directory holding the source PDFs: the `pastpaper-scraper/papers/COMP2211`
# folder that sits next to the directory containing this script.
PAPERS_DIR = Path(__file__).parent.parent.joinpath(
    "pastpaper-scraper",
    "papers",
    "COMP2211",
)
|
||||
|
||||
|
||||
def main() -> None:
    """Upload every PDF listed in ``MANIFEST`` to the ``papers`` bucket.

    Each local file under ``PAPERS_DIR`` is uploaded to its storage path with
    upsert enabled.  A missing or unreadable file, or a failed upload, is
    reported and counted without stopping the remaining uploads.  A summary
    line is printed at the end.

    Raises:
        SystemExit: with status 1 when at least one entry failed, so that
            callers and automation can detect an incomplete upload.
    """
    # Imported here rather than at module level, as in the original script.
    from app.services.supabase_client import get_supabase

    sb = get_supabase()
    bucket = sb.storage.from_("papers")

    ok = 0
    # Nothing increments this counter; it is kept so the summary line keeps
    # its existing format.
    skipped = 0
    failed = 0

    for storage_path, local_name in MANIFEST:
        local_file = PAPERS_DIR / local_name
        if not local_file.exists():
            print(f" MISSING local file: {local_name}")
            failed += 1
            continue

        try:
            # Reading inside the try block means an unreadable file is
            # counted as a failure instead of aborting the whole batch.
            bucket.upload(
                storage_path,
                local_file.read_bytes(),
                file_options={"content-type": "application/pdf", "upsert": "true"},
            )
        except Exception as exc:
            print(f" ERR {storage_path}: {exc}")
            failed += 1
        else:
            print(f" OK {storage_path}")
            ok += 1

    print(f"\nDone: {ok} uploaded, {skipped} skipped, {failed} failed.")

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
1969
backend/uv.lock
generated
Normal file
1969
backend/uv.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user