Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Zhao
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions

View File

View File

@@ -0,0 +1,285 @@
"""Course-level analytics endpoints."""
from __future__ import annotations
from collections import Counter, defaultdict
from fastapi import APIRouter
from app.services.supabase_client import get_supabase
router = APIRouter()
# Numeric weight of each difficulty level; used to average question difficulty.
DIFFICULTY_SCORE = {"easy": 1, "medium": 2, "hard": 3}
# Display label for a rounded average difficulty score.
DIFFICULTY_LABEL = {1: "Easy", 2: "Medium", 3: "Hard"}
# ── Topic normalization ──────────────────────────────────────
# Maps variant spellings of a topic to its canonical display label.
# Keys must be lower-case and free of surrounding whitespace, because
# normalize_topic() looks labels up in exactly that form.
_TOPIC_ALIASES: dict[str, str] = {
    "numpy": "NumPy",
    "naïve bayes": "Naive Bayes",
    "naïve bayes classifier": "Naive Bayes",
    "naive bayes classifier": "Naive Bayes",
    "bayes classifier": "Naive Bayes",
    "bayes model": "Naive Bayes",
    "bayes' theorem": "Naive Bayes",
    "bayes' rule": "Naive Bayes",
    "k-nearest neighbors": "K-Nearest Neighbors (KNN)",
    "knn": "K-Nearest Neighbors (KNN)",
    "k-means clustering": "K-Means Clustering",
    "k-means": "K-Means Clustering",
    "k means": "K-Means Clustering",
    "multilayer perceptron": "Multilayer Perceptron (MLP)",
    "multi-layer perceptron": "Multilayer Perceptron (MLP)",
    "multi-layer perceptron (mlp)": "Multilayer Perceptron (MLP)",
    "mlp": "Multilayer Perceptron (MLP)",
    "single layer perceptron": "Perceptron",
    "convolutional neural network": "CNN",
    "convolutional neural network (cnn)": "CNN",
    "convolutional neural networks": "CNN",
    "cnn architecture": "CNN",
    "cnn properties": "CNN",
    "python fundamentals": "Python",
    "python programming": "Python",
    "python implementation": "Python",
    "advanced python programming": "Python",
    "python programming: convolutional neural network": "CNN",
    "cross-validation": "Cross Validation",
    "model evaluation implementation": "Model Evaluation",
    "digital image processing": "Image Processing",
    "computer vision": "Image Processing",
    "array slicing": "Array Slicing",
    "slicing": "Array Slicing",
    "array indexing": "Array Slicing",
    "array reshaping": "Reshape",
    "array views": "Array Slicing",
    "view vs copy": "Array Slicing",
    "boolean indexing": "Array Slicing",
    "arange": "NumPy",
    "newaxis": "NumPy",
    "expand dims": "NumPy",
    "transpose": "NumPy",
    "type casting": "NumPy",
    "element-wise operation": "NumPy",
    "array reduction": "NumPy",
    "multi-dimensional array": "NumPy",
    "dot product": "NumPy",
    "vectorization": "NumPy",
    "activation functions": "Activation Function",
    "linear activation function": "Activation Function",
    "neural network architecture": "Neural Networks",
    "hidden layer": "Neural Networks",
    "deep learning": "Neural Networks",
    "deep learning frameworks": "Neural Networks",
    "alpha-beta pruning": "Alpha-Beta Pruning",
    "minimax algorithm": "Minimax",
    "ethics of ai": "AI Ethics",
    "ethics": "AI Ethics",
    "cosine distance": "Cosine Similarity",
    "distance calculation": "Distance Metrics",
    "euclidean distance": "Distance Metrics",
    "manhattan distance": "Distance Metrics",
    "hamming distance": "Distance Metrics",
    "precision": "Model Evaluation",
    "recall": "Model Evaluation",
    "f1 score": "Model Evaluation",
    "macro f1 score": "Model Evaluation",
    "accuracy": "Model Evaluation",
    "classification accuracy": "Model Evaluation",
    "confusion matrix": "Model Evaluation",
    "convolution operation": "Convolution",
    "dilated convolution": "Convolution",
    "3d convolution": "Convolution",
    "gaussian likelihood": "Probability",
    "gaussian distribution": "Probability",
    "categorical likelihood": "Probability",
    "conditional probability": "Probability",
    "total probability theorem": "Probability",
    "probability assumptions": "Probability",
    "tensorflow": "Keras",
    "model summary": "Keras",
    "model construction": "Keras",
    "trainable parameters": "Parameter Calculation",
    "parameter reduction": "Parameter Calculation",
    "output shape calculation": "Parameter Calculation",
    "shape calculation": "Parameter Calculation",
}


def normalize_topic(label: str) -> str:
    """Return the canonical display label for a raw topic label.

    The alias lookup is case-insensitive and ignores surrounding whitespace.
    A label without a known alias is returned with its surrounding whitespace
    removed, so ``"Recursion"`` and ``" Recursion "`` count as the same topic
    instead of producing two separate buckets.
    """
    cleaned = label.strip()
    return _TOPIC_ALIASES.get(cleaned.lower(), cleaned)
def extract_topic_labels(question: dict) -> list[str]:
    """Return the question's normalized topic labels, de-duplicated in order.

    ``analytics_topic`` comes first, followed by ``topic_tags``. The legacy
    ``topics`` column is consulted only when neither of those yields a label.
    """
    primary = question.get("analytics_topic")
    raw = [primary] if primary else []
    raw += [tag for tag in (question.get("topic_tags") or []) if tag]
    if not raw:
        raw = [tag for tag in (question.get("topics") or []) if tag]
    # dict.fromkeys keeps the first occurrence of each normalized label.
    return list(dict.fromkeys(normalize_topic(item) for item in raw))
def extract_question_family(question: dict) -> str:
    """Return the question's format, else its type, else ``"unknown"``."""
    for key in ("question_format", "question_type"):
        value = question.get(key)
        if value:
            return value
    return "unknown"
@router.get("/courses")
async def list_courses():
    """Return the sorted course codes that have at least one paper with status ``ready``."""
    client = get_supabase()
    response = client.table("papers").select("course_code").eq("status", "ready").execute()
    unique_codes = set()
    for record in response.data:
        code = record.get("course_code")
        if code:
            unique_codes.add(code)
    return sorted(unique_codes)
def _paper_source_label(paper: dict) -> str:
    """Build a label such as ``2023 Spring Final Part A`` from a paper row."""
    # Columns can be present but NULL, so coalesce before calling str methods.
    year = paper.get("year")
    term = (paper.get("term") or "").title()
    exam_type = (paper.get("exam_type") or "").title()
    label = f"{'' if year is None else year} {term} {exam_type}".strip()
    if paper.get("part_label"):
        label = f"{label} Part {paper['part_label']}"
    return label


@router.get("/course/{course_code}")
async def get_course_analytics(course_code: str):
    """Aggregate topic, question-type and difficulty statistics for one course.

    Only papers with status ``ready`` are considered, and the course code is
    matched in upper case. The response always carries the same keys:
    ``course_code``, ``kpi``, ``topic_frequency``, ``question_types``,
    ``all_questions``, ``difficulty_distribution`` and ``high_yield_topics``.
    """
    code = course_code.upper()
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label, status")
        .eq("course_code", code)
        .eq("status", "ready")
        .order("year", desc=True)
        .execute()
        .data
    )
    if not papers:
        # Same shape as the populated response so clients can read every key.
        return {
            "course_code": code,
            "kpi": {"papers": 0, "questions": 0, "topics": 0, "difficulty": "N/A"},
            "topic_frequency": [],
            "question_types": [],
            "all_questions": [],
            "difficulty_distribution": {"easy": 0, "medium": 0, "hard": 0},
            "high_yield_topics": [],
        }

    paper_ids = [paper["id"] for paper in papers]
    questions = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, difficulty"
        )
        .in_("paper_id", paper_ids)
        .order("display_order")
        .execute()
        .data
    )
    papers_by_id = {paper["id"]: paper for paper in papers}
    total_questions = len(questions)

    topic_counter: Counter[str] = Counter()
    type_counter: Counter[str] = Counter()
    difficulty_counter: Counter[str] = Counter()
    topic_examples: dict[str, list[dict]] = defaultdict(list)
    difficulty_scores: list[int] = []
    all_question_items: list[dict] = []

    for question in questions:
        question_type = extract_question_family(question)
        type_counter[question_type] += 1

        # Questions with a missing or unrecognised difficulty are left out of the average.
        difficulty = question.get("difficulty")
        if difficulty in DIFFICULTY_SCORE:
            difficulty_counter[difficulty] += 1
            difficulty_scores.append(DIFFICULTY_SCORE[difficulty])

        paper = papers_by_id.get(question["paper_id"], {})
        topics = extract_topic_labels(question)
        q_item = {
            "paper_id": paper.get("id"),
            "source": _paper_source_label(paper),
            "question_number": question["question_number"],
            # question_text may be NULL; slicing None would raise.
            "preview": (question.get("question_text") or "")[:220],
            "difficulty": question.get("difficulty"),
            "question_type": question_type,
            "year": paper.get("year"),
            "term": paper.get("term"),
            "exam_type": paper.get("exam_type"),
            "topics": topics,
        }
        all_question_items.append(q_item)
        for topic in topics:
            topic_counter[topic] += 1
            topic_examples[topic].append(q_item)

    avg_difficulty = "N/A"
    if difficulty_scores:
        rounded = round(sum(difficulty_scores) / len(difficulty_scores))
        avg_difficulty = DIFFICULTY_LABEL.get(rounded, "Medium")

    def percent(count: int) -> int:
        """Share of all questions, as a whole-number percentage."""
        return round((count / total_questions) * 100) if total_questions else 0

    topic_frequency = [
        {
            "label": topic,
            "count": count,
            "pct": percent(count),
            "questions": topic_examples[topic],
        }
        for topic, count in topic_counter.most_common()
    ]
    question_types = [
        {"label": label, "count": count, "pct": percent(count)}
        for label, count in type_counter.most_common()
    ]

    return {
        "course_code": code,
        "kpi": {
            "papers": len(papers),
            "questions": total_questions,
            "topics": len(topic_counter),
            "difficulty": avg_difficulty,
        },
        "topic_frequency": topic_frequency,
        "question_types": question_types,
        "all_questions": all_question_items,
        "difficulty_distribution": {
            "easy": difficulty_counter.get("easy", 0),
            "medium": difficulty_counter.get("medium", 0),
            "hard": difficulty_counter.get("hard", 0),
        },
        "high_yield_topics": [topic for topic, _ in topic_counter.most_common(5)],
    }

View File

@@ -0,0 +1,208 @@
"""用户答题记录 + 拍照批改 + 错题本"""
import asyncio
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
from pydantic import BaseModel
from app.services.supabase_client import get_supabase
from app.services.grader import ocr_photo, grade_answer
from app.dependencies.auth import get_current_user_id
router = APIRouter()
class AttemptCreate(BaseModel):
    """Request body for recording one answer attempt."""
    question_id: str
    attempt_type: str  # "select" | "input" | "photo"
    user_answer: str | None = None
    # Client-reported result; create_attempt puts the attempt in the error book when this is False.
    is_correct: bool | None = None
class AttemptUpdate(BaseModel):
    """Request body for updating an attempt's error-book flags; None means "leave unchanged"."""
    in_error_book: bool | None = None
    mastered: bool | None = None
@router.post("/")
async def create_attempt(data: AttemptCreate, user_id: str = Depends(get_current_user_id)):
    """Record one answer attempt for the current user and return the stored row."""
    client = get_supabase()
    payload = {
        "user_id": user_id,
        "question_id": data.question_id,
        "attempt_type": data.attempt_type,
        "user_answer": data.user_answer,
        "is_correct": data.is_correct,
    }
    # Only an explicit False (not None) sends the attempt to the error book.
    answered_wrong = data.is_correct is False
    if answered_wrong:
        payload["in_error_book"] = True
    inserted = client.table("user_attempts").insert(payload).execute()
    return inserted.data[0]
@router.post("/photo")
async def photo_attempt(
    question_id: str = Form(...),
    photo: UploadFile = File(...),
    user_id: str = Depends(get_current_user_id),
):
    """Grade a photographed answer: store the photo, OCR it, grade it, save the attempt.

    Returns a dict with the stored ``attempt`` row, the ``ocr_text`` and the
    raw ``grade`` result.

    Raises:
        HTTPException: 404 when ``question_id`` does not match a stored question.
    """
    sb = get_supabase()

    # 1. Look the question up first, so an unknown id fails before anything
    #    is uploaded or sent to OCR.
    q_result = sb.table("paper_questions").select("*").eq("id", question_id).execute()
    if not q_result.data:
        raise HTTPException(status_code=404, detail="Question not found")
    question = q_result.data[0]

    # 2. Read photo
    photo_bytes = await photo.read()

    # 3. Upload to storage. The filename is client-supplied and may be missing
    #    or contain path separators, so keep only a sanitised base name.
    base_name = (photo.filename or "").replace("\\", "/").rsplit("/", 1)[-1]
    safe_name = "".join(
        ch if ch.isascii() and (ch.isalnum() or ch in "._-") else "_"
        for ch in base_name
    ).lstrip(".") or "photo.jpg"
    storage_path = f"attempts/{user_id}/{question_id}/{safe_name}"
    bucket = sb.storage.from_("attempt-photos")
    bucket.upload(
        storage_path, photo_bytes,
        file_options={"content-type": photo.content_type or "image/jpeg", "upsert": "true"},
    )
    photo_url = bucket.get_public_url(storage_path)

    # 4. OCR (run in thread pool to avoid blocking event loop)
    ocr_text = await asyncio.to_thread(ocr_photo, photo_bytes)

    # 5. AI grading (run in thread pool)
    grade_result = await asyncio.to_thread(grade_answer, question, ocr_text)
    is_correct = grade_result.get("is_correct", False)

    # 6. Save attempt; anything not graded correct goes to the error book.
    record = {
        "user_id": user_id,
        "question_id": question_id,
        "attempt_type": "photo",
        "photo_url": photo_url,
        "photo_ocr_text": ocr_text,
        "is_correct": is_correct,
        "feedback": grade_result.get("feedback", ""),
        "error_at_step": grade_result.get("error_at_step"),
        "in_error_book": not is_correct,
    }
    result = sb.table("user_attempts").insert(record).execute()
    return {
        "attempt": result.data[0],
        "ocr_text": ocr_text,
        "grade": grade_result,
    }
@router.get("/error-book")
async def get_error_book(
    course_code: str | None = None,
    user_id: str = Depends(get_current_user_id),
):
    """Return the user's unmastered error-book attempts, newest first.

    Each attempt is returned with its question under ``paper_questions`` and
    that question's paper under ``paper_questions.paper``. Attempts whose
    question no longer exists are dropped.

    Args:
        course_code: Optional case-insensitive course filter. When given, only
            attempts whose paper belongs to that course are returned.
    """
    sb = get_supabase()
    attempts = (
        sb.table("user_attempts")
        .select("*")
        .eq("user_id", user_id)
        .eq("in_error_book", True)
        .eq("mastered", False)
        .order("created_at", desc=True)
        .execute()
        .data
    )
    if not attempts:
        return []

    question_ids = list({attempt["question_id"] for attempt in attempts})
    questions = (
        sb.table("paper_questions")
        .select("*")
        .in_("id", question_ids)
        .execute()
        .data
    )
    questions_by_id = {question["id"]: question for question in questions}

    paper_ids = list({question["paper_id"] for question in questions})
    # Skip the round trip (and an empty IN filter) when no question was found.
    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label")
        .in_("id", paper_ids)
        .execute()
        .data
        if paper_ids
        else []
    )
    papers_by_id = {paper["id"]: paper for paper in papers}

    wanted_code = course_code.upper() if course_code else None
    enriched = []
    for attempt in attempts:
        question = questions_by_id.get(attempt["question_id"])
        if not question:
            continue
        paper = papers_by_id.get(question["paper_id"])
        # With a filter active, an attempt whose paper is unknown cannot be
        # shown to belong to the requested course, so it is excluded too.
        if wanted_code and (not paper or paper.get("course_code") != wanted_code):
            continue
        enriched.append(
            {
                **attempt,
                "paper_questions": {
                    **question,
                    "paper": paper,
                },
            }
        )
    return enriched
@router.get("/by-paper/{paper_id}")
async def get_paper_attempts(paper_id: str, user_id: str = Depends(get_current_user_id)):
    """Return the user's latest photo-graded attempt for each question of a paper.

    The result is ordered newest first and holds at most one attempt per
    question. It is empty when the paper has no questions.
    """
    sb = get_supabase()
    question_rows = (
        sb.table("paper_questions")
        .select("id")
        .eq("paper_id", paper_id)
        .execute()
        .data
    )
    question_ids = [row["id"] for row in question_rows]
    if not question_ids:
        return []

    # Filter by type and question in the database rather than loading every
    # attempt the user has ever made and discarding most of them here.
    attempts = (
        sb.table("user_attempts")
        .select("question_id, is_correct, feedback, photo_ocr_text, attempt_type, created_at")
        .eq("user_id", user_id)
        .eq("attempt_type", "photo")
        .in_("question_id", question_ids)
        .order("created_at", desc=True)
        .execute()
        .data
    )

    # Rows arrive newest first, so the first row seen per question is the latest.
    latest: dict[str, dict] = {}
    for attempt in attempts:
        latest.setdefault(attempt["question_id"], attempt)
    return list(latest.values())
@router.patch("/{attempt_id}")
async def update_attempt(
    attempt_id: str,
    data: AttemptUpdate,
    user_id: str = Depends(get_current_user_id),
):
    """Update the error-book flags (``in_error_book``, ``mastered``) of one attempt.

    The update is restricted to attempts owned by the authenticated user, so
    one user cannot modify another user's records.

    Raises:
        HTTPException: 400 when the body contains no field to update; 404 when
            the attempt does not exist or belongs to another user.
    """
    sb = get_supabase()
    update = {}
    if data.in_error_book is not None:
        update["in_error_book"] = data.in_error_book
    if data.mastered is not None:
        update["mastered"] = data.mastered
    if not update:
        raise HTTPException(status_code=400, detail="Nothing to update")
    result = (
        sb.table("user_attempts")
        .update(update)
        .eq("id", attempt_id)
        .eq("user_id", user_id)
        .execute()
    )
    if not result.data:
        raise HTTPException(status_code=404, detail="Attempt not found")
    return result.data[0]

View File

@@ -0,0 +1,142 @@
"""试卷上传 + 处理管线"""
import asyncio
import threading
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends
from app.services.supabase_client import get_supabase
from app.services.text_extractor import extract_pdf, get_full_text
from app.services.paper_processor import process_paper
from app.dependencies.auth import get_current_user_id
router = APIRouter()
def _upload_and_process_sync(
    paper_id: str,
    storage_path: str,
    paper_bytes: bytes,
    answer_bytes: bytes | None,
):
    """Upload the PDFs to Storage, record their URLs, then run AI processing.

    Blocking; intended to run in its own thread. An upload failure is logged
    but does not stop processing, which works from the in-memory bytes.
    """
    import logging  # local import: used only for the best-effort upload path

    sb = get_supabase()
    try:
        bucket = sb.storage.from_("papers")
        paper_storage_path = f"{storage_path}/paper.pdf"
        bucket.upload(
            paper_storage_path, paper_bytes,
            file_options={"content-type": "application/pdf", "upsert": "true"},
        )
        update_data: dict = {"paper_file_url": bucket.get_public_url(paper_storage_path)}
        if answer_bytes:
            answer_storage_path = f"{storage_path}/answer.pdf"
            bucket.upload(
                answer_storage_path, answer_bytes,
                file_options={"content-type": "application/pdf", "upsert": "true"},
            )
            update_data["answer_file_url"] = bucket.get_public_url(answer_storage_path)
        sb.table("papers").update(update_data).eq("id", paper_id).execute()
    except Exception:
        # Still best effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).exception(
            "Storage upload failed for paper %s", paper_id
        )
    # process_paper is async, so run it on a fresh event loop in this thread.
    asyncio.run(process_paper(paper_id, paper_bytes, answer_bytes))
@router.get("/")
async def list_papers():
    """List all papers, newest first (papers are a public asset shared by every user)."""
    columns = (
        "id, course_code, year, term, exam_type, status, question_count, "
        "total_score, difficulty_level, processing_step, processing_progress, "
        "processing_total, created_at"
    )
    query = get_supabase().table("papers").select(columns)
    response = query.order("created_at", desc=True).execute()
    return response.data
@router.get("/mine")
async def my_papers(user_id: str = Depends(get_current_user_id)):
    """List the papers uploaded by the current user, newest first, whatever their status."""
    columns = (
        "id, course_code, year, term, exam_type, part_label, status, "
        "question_count, processing_step, processing_progress, "
        "processing_total, created_at"
    )
    query = get_supabase().table("papers").select(columns).eq("user_id", user_id)
    response = query.order("created_at", desc=True).execute()
    return response.data
@router.post("/upload")
async def upload_paper(
    paper_file: UploadFile = File(...),
    answer_file: UploadFile | None = File(None),
    course_code: str = Form(...),
    year: int = Form(...),
    term: str = Form(...),
    exam_type: str = Form(...),
    user_id: str = Depends(get_current_user_id),
):
    """Accept a paper PDF (and optional answer PDF) and start background processing.

    A ``papers`` row with status ``processing`` is created immediately and its
    id returned. The Storage upload and AI processing run in a separate thread.
    """
    sb = get_supabase()
    code = course_code.upper()

    # 1. Read the uploaded files into memory.
    paper_bytes = await paper_file.read()
    answer_bytes = await answer_file.read() if answer_file else None

    # 2. Create the record right away (status=processing) so we can return fast.
    paper_record = sb.table("papers").insert({
        "user_id": user_id,
        "course_code": code,
        "year": year,
        "term": term,
        "exam_type": exam_type,
        "paper_file_url": "",  # filled in by the background upload
        "answer_file_url": None,
        "status": "processing",
    }).execute()
    paper_id = paper_record.data[0]["id"]

    # The paper id makes the folder unique. Without it, two uploads with the
    # same course/year/term/exam type (e.g. two parts of one exam) would
    # overwrite each other's files, because the upload uses upsert.
    storage_path = f"{code}/{year}_{term}_{exam_type}/{paper_id}"

    # 3. Run in a separate thread so the event loop is never blocked.
    threading.Thread(
        target=_upload_and_process_sync,
        args=(paper_id, storage_path, paper_bytes, answer_bytes),
        daemon=True,
    ).start()
    return {
        "paper_id": paper_id,
        "status": "processing",
        "message": "试卷已上传,正在处理中...",
    }
@router.get("/{paper_id}")
async def get_paper(paper_id: str):
    """Return the full paper row (including its processing status), or 404 if absent."""
    matches = get_supabase().table("papers").select("*").eq("id", paper_id).execute().data
    if matches:
        return matches[0]
    raise HTTPException(status_code=404, detail="Paper not found")
@router.get("/{paper_id}/questions")
async def get_questions(paper_id: str):
    """Return every question row of a paper, with all columns, in display order."""
    query = get_supabase().table("paper_questions").select("*").eq("paper_id", paper_id)
    response = query.order("display_order").execute()
    return response.data

View File

@@ -0,0 +1,325 @@
"""题目相关:变式题生成 + 相似题召回"""
from __future__ import annotations
import asyncio
import time
from fastapi import APIRouter, HTTPException, Depends
from pydantic import BaseModel
from app.services.supabase_client import get_supabase
from app.services.grader import generate_variant
from app.dependencies.auth import get_current_user_id
# Simple in-memory cache: question_id → (timestamp, result)
# The timestamp is time.time() at insertion. Entries older than _CACHE_TTL are
# ignored on lookup; no eviction is visible in this module.
_similar_cache: dict[str, tuple[float, list]] = {}
_CACHE_TTL = 300  # seconds (5 minutes)
class VariantUpdate(BaseModel):
    """Request body for updating a variant; None means "leave unchanged"."""
    favorited: bool | None = None
router = APIRouter()
def normalized_labels(values: list[str] | None) -> dict[str, str]:
labels: dict[str, str] = {}
for value in values or []:
if value:
labels[value.lower()] = value
return labels
def question_family(question: dict) -> str:
    """Return the question's format, else its type, else ``"unknown"``."""
    fmt = question.get("question_format")
    if fmt:
        return fmt
    kind = question.get("question_type")
    return kind if kind else "unknown"
def display_topics(question: dict) -> list[str]:
labels: list[str] = []
analytics_topic = question.get("analytics_topic")
if analytics_topic:
labels.append(analytics_topic)
for topic in question.get("topic_tags") or []:
if topic and topic not in labels:
labels.append(topic)
if labels:
return labels
for topic in question.get("topics") or []:
if topic and topic not in labels:
labels.append(topic)
return labels
def similarity_score(
    target: dict,
    candidate: dict,
    text_score: float = 0.0,
) -> tuple[int, list[str]]:
    """Score how similar ``candidate`` is to ``target``.

    Returns ``(points, reasons)``: ``points`` is capped at 99 and ``reasons``
    lists human-readable explanations in the order the criteria are checked.
    """
    points = 0
    why: list[str] = []

    # Primary topic bucket: 40 pts
    topic = target.get("analytics_topic")
    same_topic = bool(topic) and topic == candidate.get("analytics_topic")
    if same_topic:
        points += 40
        why.append(f"Same topic: {topic}")

    # Concept overlap: 10 pts per shared tag, up to 20 pts
    own_tags = normalized_labels(target.get("topic_tags"))
    other_tags = normalized_labels(candidate.get("topic_tags"))
    common_tags = sorted(own_tags.keys() & other_tags.keys())
    if common_tags:
        points += min(10 * len(common_tags), 20)
        # A matching primary topic already explains the overlap; avoid repeating it.
        if not same_topic:
            listed = ", ".join(own_tags[key] for key in common_tags[:2])
            why.append("Shared concept: " + listed)

    # Skill overlap: 10 pts per shared skill, up to 20 pts
    own_skills = normalized_labels(target.get("skill_tags"))
    other_skills = normalized_labels(candidate.get("skill_tags"))
    common_skills = sorted(own_skills.keys() & other_skills.keys())
    if common_skills:
        points += min(10 * len(common_skills), 20)
        listed = ", ".join(own_skills[key] for key in common_skills[:2])
        why.append("Shared skill: " + listed)

    # Same question format: 10 pts
    if question_family(target) == question_family(candidate):
        points += 10
        why.append("Same format")

    # Same difficulty: 5 pts
    level = candidate.get("difficulty")
    if level and level == target.get("difficulty"):
        points += 5
        why.append("Same difficulty")

    # Full-text similarity from PostgreSQL ts_rank_cd: up to 20 pts
    if text_score > 0:
        wording_pts = min(round(text_score * 60), 20)
        points += wording_pts
        if wording_pts >= 4:
            why.append("Similar wording")

    return min(points, 99), why
@router.get("/variants/favorited")
async def get_favorited_variants(user_id: str = Depends(get_current_user_id)):
    """Return every variant the user has favorited, newest first (used by the Error Book)."""
    columns = (
        "*, paper_questions(question_number, paper_id, "
        "papers(id, course_code, year, term, exam_type, part_label))"
    )
    query = (
        get_supabase()
        .table("question_variants")
        .select(columns)
        .eq("user_id", user_id)
        .eq("favorited", True)
    )
    return query.order("created_at", desc=True).execute().data
@router.post("/{question_id}/variant")
async def create_variant(question_id: str, user_id: str = Depends(get_current_user_id)):
    """Generate a variant of a question, store it for the user and return the stored row.

    Raises:
        HTTPException: 404 when the source question does not exist.
    """
    client = get_supabase()
    lookup = client.table("paper_questions").select("*").eq("id", question_id).execute()
    if not lookup.data:
        raise HTTPException(status_code=404, detail="Question not found")
    source = lookup.data[0]

    # Generation is blocking, so it runs in a worker thread.
    generated = await asyncio.to_thread(generate_variant, source)
    generated["knowledge_reminder"] = source.get("knowledge_reminder", "")

    new_row = {
        "user_id": user_id,
        "source_question_id": question_id,
        "variant_data": generated,
        "favorited": False,
    }
    stored = client.table("question_variants").insert(new_row).execute().data[0]
    stored["source_question_number"] = source["question_number"]
    return stored
@router.get("/{question_id}/variants")
async def list_variants(question_id: str, user_id: str = Depends(get_current_user_id)):
    """Return the user's variants of one question, newest first.

    Every row also carries ``source_question_number``, which is an empty
    string when the source question cannot be found.
    """
    client = get_supabase()
    lookup = client.table("paper_questions").select("question_number").eq("id", question_id).execute()
    if lookup.data:
        number = lookup.data[0]["question_number"]
    else:
        number = ""
    variants = (
        client.table("question_variants")
        .select("*")
        .eq("user_id", user_id)
        .eq("source_question_id", question_id)
        .order("created_at", desc=True)
        .execute()
        .data
    )
    return [{**variant, "source_question_number": number} for variant in variants]
@router.patch("/variant/{variant_id}")
async def update_variant(variant_id: str, data: VariantUpdate, user_id: str = Depends(get_current_user_id)):
    """Favorite or unfavorite one of the current user's variants.

    Raises:
        HTTPException: 400 when the body sets nothing; 404 when no variant
            with this id belongs to the user.
    """
    client = get_supabase()
    if data.favorited is None:
        raise HTTPException(status_code=400, detail="Nothing to update")
    changes: dict = {"favorited": data.favorited}
    query = client.table("question_variants").update(changes)
    updated = query.eq("id", variant_id).eq("user_id", user_id).execute().data
    if updated:
        return updated[0]
    raise HTTPException(status_code=404, detail="Variant not found")
@router.delete("/variant/{variant_id}", status_code=204)
async def delete_variant(variant_id: str, user_id: str = Depends(get_current_user_id)):
    """Delete one of the current user's variants; no check is made that a row matched."""
    query = get_supabase().table("question_variants").delete()
    query = query.eq("id", variant_id).eq("user_id", user_id)
    query.execute()
def _similar_source_label(paper: dict) -> str:
    """Build a label such as ``2023 Spring Final Part A`` from a paper row."""
    # Columns can be present but NULL, so coalesce before calling str methods.
    year = paper.get("year")
    term = (paper.get("term") or "").title()
    exam_type = (paper.get("exam_type") or "").title()
    label = f"{'' if year is None else year} {term} {exam_type}".strip()
    if paper.get("part_label"):
        label = f"{label} Part {paper['part_label']}"
    return label


@router.get("/{question_id}/similar")
async def get_similar_questions(question_id: str, limit: int = 6):
    """Retrieve similar questions from other ready papers of the same course.

    Results come, in order of preference, from the in-memory cache, from the
    pre-computed ``similar_questions`` column, or from an on-the-fly ranking
    that is then cached and written back to that column.

    Args:
        question_id: Id of the question to find neighbours for.
        limit: Maximum number of results; clamped to the range 1..12.

    Raises:
        HTTPException: 404 when the question or its paper does not exist.
    """
    import logging  # local import: used only for the best-effort paths below

    logger = logging.getLogger(__name__)
    max_items = max(1, min(limit, 12))

    # Cache hit
    cached = _similar_cache.get(question_id)
    if cached and (time.time() - cached[0]) < _CACHE_TTL:
        return cached[1][:max_items]

    sb = get_supabase()
    result = sb.table("paper_questions").select("*, similar_questions").eq("id", question_id).execute()
    if not result.data:
        raise HTTPException(status_code=404, detail="Question not found")
    target = result.data[0]

    # Pre-computed results are returned as stored; nothing here refreshes them.
    if target.get("similar_questions"):
        precomputed = target["similar_questions"]
        _similar_cache[question_id] = (time.time(), precomputed)
        return precomputed[:max_items]

    # Fallback: compute on the fly for questions not yet backfilled.
    paper_result = sb.table("papers").select("id, course_code").eq("id", target["paper_id"]).execute()
    if not paper_result.data:
        raise HTTPException(status_code=404, detail="Paper not found")
    course_code = paper_result.data[0]["course_code"]

    papers = (
        sb.table("papers")
        .select("id, course_code, year, term, exam_type, part_label")
        .eq("course_code", course_code)
        .eq("status", "ready")
        .execute()
        .data
    )
    paper_ids = [paper["id"] for paper in papers if paper["id"] != target["paper_id"]]
    if not paper_ids:
        return []
    papers_by_id = {paper["id"]: paper for paper in papers}

    # Pre-filter by analytics_topic in DB when possible (cuts candidates from ~250 to ~30)
    candidates_query = (
        sb.table("paper_questions")
        .select(
            "id, paper_id, question_number, question_type, question_format, "
            "question_text, score, topics, analytics_topic, topic_tags, skill_tags, "
            "difficulty, knowledge_reminder, ai_hint, solution"
        )
        .in_("paper_id", paper_ids)
    )
    target_topic = target.get("analytics_topic")
    if target_topic:
        candidates_query = candidates_query.eq("analytics_topic", target_topic)
    candidates = candidates_query.execute().data
    if not candidates:
        return []

    # Batch full-text scores from PostgreSQL (skip if too many candidates — slow)
    text_scores: dict[str, float] = {}
    if len(candidates) <= 50:
        try:
            rpc_result = sb.rpc(
                "text_similarity_scores",
                {
                    "query_text": target.get("question_text") or "",
                    "candidate_ids": [c["id"] for c in candidates],
                },
            ).execute()
            for row in rpc_result.data or []:
                text_scores[row["question_id"]] = float(row["text_score"] or 0)
        except Exception:
            # Text similarity is optional; rank on tags alone, but leave a trace.
            logger.warning(
                "text_similarity_scores failed for question %s", question_id, exc_info=True
            )

    ranked = []
    for candidate in candidates:
        text_score = text_scores.get(candidate["id"], 0.0)
        match_percent, reasons = similarity_score(target, candidate, text_score)
        if match_percent < 20:
            continue
        paper = papers_by_id.get(candidate["paper_id"], {})
        ranked.append(
            {
                "id": candidate["id"],
                "paper_id": candidate["paper_id"],
                "source": _similar_source_label(paper),
                "question_number": candidate["question_number"],
                "match_percent": match_percent,
                "match_reasons": reasons,
                "question_type": question_family(candidate),
                "question_text": candidate["question_text"],
                "topics": display_topics(candidate),
                "difficulty": candidate.get("difficulty"),
                "knowledge_reminder": candidate.get("knowledge_reminder", ""),
                "ai_hint": candidate.get("ai_hint", ""),
                "solution": candidate.get("solution", ""),
            }
        )
    ranked.sort(key=lambda item: (-item["match_percent"], item["source"], item["question_number"]))

    # Keep only the best-scoring question per paper
    seen_papers: set[str] = set()
    deduped = []
    for item in ranked:
        if item["paper_id"] not in seen_papers:
            seen_papers.add(item["paper_id"])
            deduped.append(item)

    _similar_cache[question_id] = (time.time(), deduped)

    # Persist to DB so future requests are instant
    try:
        sb.table("paper_questions").update({"similar_questions": deduped}).eq("id", question_id).execute()
    except Exception:
        # Persistence is an optimisation; the computed result is still returned.
        logger.warning(
            "Could not persist similar questions for %s", question_id, exc_info=True
        )
    return deduped[:max_items]