Initial commit: PastPaper Master full stack

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:15:35 +07:00
commit 7a09167261
105 changed files with 24799 additions and 0 deletions
--- a/backend/app/routers/analytics.py
+++ b/backend/app/routers/analytics.py
@@ -0,0 +1,285 @@
+"""Course-level analytics endpoints."""
+
+from __future__ import annotations
+
+from collections import Counter, defaultdict
+
+from fastapi import APIRouter
+
+from app.services.supabase_client import get_supabase
+
+# Router on which the analytics endpoints defined in this module are registered.
+router = APIRouter()
+
+
+# Ordinal score for each recognised difficulty value; used to average the
+# difficulty of a course's questions.  Values not listed here are ignored.
+DIFFICULTY_SCORE = {"easy": 1, "medium": 2, "hard": 3}
+# Display label for a rounded average difficulty score.
+DIFFICULTY_LABEL = {1: "Easy", 2: "Medium", 3: "Hard"}
+
+# ── Topic normalization ──────────────────────────────────────
+# Maps variant spellings (and closely related sub-topics) to the canonical
+# label they are counted under.  Keys MUST be lower-case and stripped, because
+# lookups are performed on the lower-cased, stripped input label.
+_TOPIC_ALIASES: dict[str, str] = {
+    "numpy": "NumPy",
+    "naïve bayes": "Naive Bayes",
+    "naïve bayes classifier": "Naive Bayes",
+    "naive bayes classifier": "Naive Bayes",
+    "bayes classifier": "Naive Bayes",
+    "bayes model": "Naive Bayes",
+    "bayes' theorem": "Naive Bayes",
+    "bayes' rule": "Naive Bayes",
+    "k-nearest neighbors": "K-Nearest Neighbors (KNN)",
+    "knn": "K-Nearest Neighbors (KNN)",
+    "k-means clustering": "K-Means Clustering",
+    "k-means": "K-Means Clustering",
+    "k means": "K-Means Clustering",
+    "multilayer perceptron": "Multilayer Perceptron (MLP)",
+    "multi-layer perceptron": "Multilayer Perceptron (MLP)",
+    "multi-layer perceptron (mlp)": "Multilayer Perceptron (MLP)",
+    "mlp": "Multilayer Perceptron (MLP)",
+    "single layer perceptron": "Perceptron",
+    "convolutional neural network": "CNN",
+    "convolutional neural network (cnn)": "CNN",
+    "convolutional neural networks": "CNN",
+    "cnn architecture": "CNN",
+    "cnn properties": "CNN",
+    "python fundamentals": "Python",
+    "python programming": "Python",
+    "python implementation": "Python",
+    "advanced python programming": "Python",
+    "python programming: convolutional neural network": "CNN",
+    "cross-validation": "Cross Validation",
+    "model evaluation implementation": "Model Evaluation",
+    "digital image processing": "Image Processing",
+    "computer vision": "Image Processing",
+    "array slicing": "Array Slicing",
+    "slicing": "Array Slicing",
+    "array indexing": "Array Slicing",
+    "array reshaping": "Reshape",
+    "array views": "Array Slicing",
+    "view vs copy": "Array Slicing",
+    "boolean indexing": "Array Slicing",
+    "arange": "NumPy",
+    "newaxis": "NumPy",
+    "expand dims": "NumPy",
+    "transpose": "NumPy",
+    "type casting": "NumPy",
+    "element-wise operation": "NumPy",
+    "array reduction": "NumPy",
+    "multi-dimensional array": "NumPy",
+    "dot product": "NumPy",
+    "vectorization": "NumPy",
+    "activation functions": "Activation Function",
+    "linear activation function": "Activation Function",
+    "neural network architecture": "Neural Networks",
+    "hidden layer": "Neural Networks",
+    "deep learning": "Neural Networks",
+    "deep learning frameworks": "Neural Networks",
+    "alpha-beta pruning": "Alpha-Beta Pruning",
+    "minimax algorithm": "Minimax",
+    "ethics of ai": "AI Ethics",
+    "ethics": "AI Ethics",
+    "cosine distance": "Cosine Similarity",
+    "distance calculation": "Distance Metrics",
+    "euclidean distance": "Distance Metrics",
+    "manhattan distance": "Distance Metrics",
+    "hamming distance": "Distance Metrics",
+    "precision": "Model Evaluation",
+    "recall": "Model Evaluation",
+    "f1 score": "Model Evaluation",
+    "macro f1 score": "Model Evaluation",
+    "accuracy": "Model Evaluation",
+    "classification accuracy": "Model Evaluation",
+    "confusion matrix": "Model Evaluation",
+    "convolution operation": "Convolution",
+    "dilated convolution": "Convolution",
+    "3d convolution": "Convolution",
+    "gaussian likelihood": "Probability",
+    "gaussian distribution": "Probability",
+    "categorical likelihood": "Probability",
+    "conditional probability": "Probability",
+    "total probability theorem": "Probability",
+    "probability assumptions": "Probability",
+    "tensorflow": "Keras",
+    "model summary": "Keras",
+    "model construction": "Keras",
+    "trainable parameters": "Parameter Calculation",
+    "parameter reduction": "Parameter Calculation",
+    "output shape calculation": "Parameter Calculation",
+    "shape calculation": "Parameter Calculation",
+}
+
+# Lookup table used by ``normalize_topic``.  Besides the explicit aliases it
+# maps every canonical label's lower-cased form to itself, so that case
+# variants such as "naive bayes" or "cnn" collapse into "Naive Bayes" / "CNN"
+# instead of being counted as separate topics.  Explicit aliases are merged
+# last so they always win.
+_TOPIC_LOOKUP: dict[str, str] = {
+    **{canonical.lower(): canonical for canonical in _TOPIC_ALIASES.values()},
+    **_TOPIC_ALIASES,
+}
+
+
+def normalize_topic(label: str) -> str:
+    """Return the canonical topic label for *label*.
+
+    The lookup is case-insensitive and ignores surrounding whitespace.  Known
+    aliases and case variants of canonical labels map to the canonical
+    spelling; unknown labels are returned with surrounding whitespace removed
+    (original casing preserved) so that ``" Foo "`` and ``"Foo"`` are counted
+    as the same topic.
+    """
+    cleaned = label.strip()
+    return _TOPIC_LOOKUP.get(cleaned.lower(), cleaned)
+
+
+def extract_topic_labels(question: dict) -> list[str]:
+    """Collect the normalized, de-duplicated topic labels of a question row.
+
+    Sources are consulted in priority order: ``analytics_topic`` first, then
+    every entry of ``topic_tags``.  The ``topics`` list is only used as a
+    fallback when neither of the former yielded a label.  Each raw label is
+    passed through ``normalize_topic`` and the first occurrence of every
+    normalized label is kept, preserving order.
+    """
+    candidates: list[str] = []
+
+    def collect(tags) -> None:
+        # Append non-empty tags that have not been collected yet.
+        for tag in tags or []:
+            if not tag or tag in candidates:
+                continue
+            candidates.append(tag)
+
+    primary = question.get("analytics_topic")
+    if primary:
+        candidates.append(primary)
+
+    collect(question.get("topic_tags"))
+    if not candidates:
+        collect(question.get("topics"))
+
+    # Different raw spellings may normalize to the same canonical label.
+    result: list[str] = []
+    emitted: set[str] = set()
+    for candidate in candidates:
+        canonical = normalize_topic(candidate)
+        if canonical in emitted:
+            continue
+        emitted.add(canonical)
+        result.append(canonical)
+
+    return result
+
+
+def extract_question_family(question: dict) -> str:
+    """Return the question's format, else its type, else ``"unknown"``.
+
+    ``question_format`` takes precedence over ``question_type``; a missing or
+    falsy value falls through to the next candidate.
+    """
+    for key in ("question_format", "question_type"):
+        value = question.get(key)
+        if value:
+            return value
+    return "unknown"
+
+
+@router.get("/courses")
+async def list_courses():
+    """Return the sorted list of course codes that have at least one paper in "ready" status."""
+    client = get_supabase()
+    query = client.table("papers").select("course_code").eq("status", "ready")
+    ready_rows = query.execute().data
+
+    # Collapse duplicates and skip rows without a course code.
+    unique_codes = set()
+    for row in ready_rows:
+        if row.get("course_code"):
+            unique_codes.add(row["course_code"])
+    return sorted(unique_codes)
+
+
+def _format_source_label(paper: dict) -> str:
+    """Build a display label such as ``"2024 Fall Final Part A"`` for a paper row.
+
+    NULL (``None``) columns are rendered as empty text instead of raising or
+    printing the literal ``"None"``.
+    """
+    year = paper.get("year")
+    year_text = "" if year is None else str(year)
+    term_text = str(paper.get("term") or "").title()
+    exam_text = str(paper.get("exam_type") or "").title()
+    label = f"{year_text} {term_text} {exam_text}".strip()
+    if paper.get("part_label"):
+        label = f"{label} Part {paper['part_label']}"
+    return label
+
+
+def _percentage(count: int, total: int) -> int:
+    """Return *count* as a rounded integer percentage of *total* (0 when *total* is 0)."""
+    return round((count / total) * 100) if total else 0
+
+
+@router.get("/course/{course_code}")
+async def get_course_analytics(course_code: str):
+    """Return aggregated analytics for all "ready" papers of a course.
+
+    The course code is matched upper-cased.  The response contains:
+
+    * ``course_code`` – the upper-cased course code.
+    * ``kpi`` – paper/question/topic counts and the average difficulty label.
+    * ``topic_frequency`` – per-topic count, percentage of all questions and
+      the questions tagged with that topic, most frequent first.
+    * ``question_types`` – per-type count and percentage, most frequent first.
+    * ``all_questions`` – one summary item per question.
+    * ``difficulty_distribution`` – counts for easy/medium/hard.
+    * ``high_yield_topics`` – the five most frequent topic labels.
+
+    When the course has no ready papers the same keys are returned with
+    empty/zero values.
+    """
+    # NOTE(review): the Supabase calls below look synchronous; inside an
+    # ``async def`` handler they would block the event loop while running.
+    # Confirm against the client returned by ``get_supabase``.
+    sb = get_supabase()
+    code = course_code.upper()
+
+    papers = (
+        sb.table("papers")
+        .select("id, course_code, year, term, exam_type, part_label, status")
+        .eq("course_code", code)
+        .eq("status", "ready")
+        .order("year", desc=True)
+        .execute()
+        .data
+    )
+    if not papers:
+        return {
+            "course_code": code,
+            "kpi": {"papers": 0, "questions": 0, "topics": 0, "difficulty": "N/A"},
+            "topic_frequency": [],
+            "question_types": [],
+            # Keep the response shape identical to the populated case.
+            "all_questions": [],
+            "difficulty_distribution": {"easy": 0, "medium": 0, "hard": 0},
+            "high_yield_topics": [],
+        }
+
+    paper_ids = [paper["id"] for paper in papers]
+    questions = (
+        sb.table("paper_questions")
+        .select(
+            "id, paper_id, question_number, question_type, question_format, "
+            "question_text, score, topics, analytics_topic, topic_tags, difficulty"
+        )
+        .in_("paper_id", paper_ids)
+        .order("display_order")
+        .execute()
+        .data
+    )
+
+    papers_by_id = {paper["id"]: paper for paper in papers}
+    total_questions = len(questions)
+    topic_counter: Counter[str] = Counter()
+    type_counter: Counter[str] = Counter()
+    difficulty_counter: Counter[str] = Counter()
+    topic_examples: dict[str, list[dict]] = defaultdict(list)
+    difficulty_scores: list[int] = []
+    all_question_items: list[dict] = []
+
+    for question in questions:
+        question_type = extract_question_family(question)
+        type_counter[question_type] += 1
+
+        # Only recognised difficulty values contribute to the statistics.
+        difficulty = question.get("difficulty")
+        if difficulty in DIFFICULTY_SCORE:
+            difficulty_counter[difficulty] += 1
+            difficulty_scores.append(DIFFICULTY_SCORE[difficulty])
+
+        paper = papers_by_id.get(question["paper_id"], {})
+        topics = extract_topic_labels(question)
+        q_item = {
+            "paper_id": paper.get("id"),
+            "source": _format_source_label(paper),
+            "question_number": question["question_number"],
+            # A NULL question text yields an empty preview instead of a TypeError.
+            "preview": (question.get("question_text") or "")[:220],
+            "difficulty": question.get("difficulty"),
+            "question_type": question_type,
+            "year": paper.get("year"),
+            "term": paper.get("term"),
+            "exam_type": paper.get("exam_type"),
+            "topics": topics,
+        }
+        all_question_items.append(q_item)
+
+        # The same item object is shared between ``all_questions`` and the
+        # per-topic example lists.
+        for topic in topics:
+            topic_counter[topic] += 1
+            topic_examples[topic].append(q_item)
+
+    avg_difficulty = "N/A"
+    if difficulty_scores:
+        # round() rounds halves to the nearest even integer, so on this 1-3
+        # scale both 1.5 and 2.5 resolve to 2 ("Medium").
+        rounded = round(sum(difficulty_scores) / len(difficulty_scores))
+        avg_difficulty = DIFFICULTY_LABEL.get(rounded, "Medium")
+
+    # A question with several topics counts once per topic, so the topic
+    # percentages may add up to more than 100.
+    topic_frequency = [
+        {
+            "label": topic,
+            "count": count,
+            "pct": _percentage(count, total_questions),
+            "questions": topic_examples[topic],
+        }
+        for topic, count in topic_counter.most_common()
+    ]
+
+    question_types = [
+        {"label": label, "count": count, "pct": _percentage(count, total_questions)}
+        for label, count in type_counter.most_common()
+    ]
+
+    return {
+        "course_code": code,
+        "kpi": {
+            "papers": len(papers),
+            "questions": total_questions,
+            "topics": len(topic_counter),
+            "difficulty": avg_difficulty,
+        },
+        "topic_frequency": topic_frequency,
+        "question_types": question_types,
+        "all_questions": all_question_items,
+        "difficulty_distribution": {
+            "easy": difficulty_counter.get("easy", 0),
+            "medium": difficulty_counter.get("medium", 0),
+            "hard": difficulty_counter.get("hard", 0),
+        },
+        "high_yield_topics": [topic for topic, _ in topic_counter.most_common(5)],
+    }