Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
174
backend/regen_ai_trio_comp2211.py
Normal file
174
backend/regen_ai_trio_comp2211.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions.

Reads existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by
paper_processor.py, but issues an UPDATE instead of an INSERT, so the question structure is untouched.

Run from the backend directory:

    uv run python regen_ai_trio_comp2211.py

Pass --dry-run to print the batches without calling the LLM or writing to the database.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked
|
||||
|
||||
|
||||
def _has_value(value: object) -> bool:
    """Return True when *value* carries a usable answer.

    ``None``, empty or whitespace-only strings, and empty containers count as
    missing. Numbers and booleans always count as present, so an option index
    of ``0`` or a ``False`` true/false answer is not mistaken for "no answer".
    """
    if value is None:
        return False
    if isinstance(value, str):
        return bool(value.strip())
    if isinstance(value, (int, float)):  # bool is a subclass of int
        return True
    return bool(value)


def build_reference_answer(q: dict) -> str:
    """Build the reference-answer string for one question row.

    The first usable field wins, in this order:

    1. ``raw_answer_text`` - returned as-is.
    2. ``correct_option``  - returned as ``"Correct option: <value>"``.
    3. ``correct_answer``  - returned as ``"Correct answer: <value>"``.

    Args:
        q: A question row as a dict; any of the three keys may be absent.

    Returns:
        The reference answer text, or ``""`` when none of the fields holds a
        usable value.
    """
    raw = q.get("raw_answer_text")
    if _has_value(raw):
        return str(raw)

    option = q.get("correct_option")
    if _has_value(option):
        return f"Correct option: {option}"

    answer = q.get("correct_answer")
    if _has_value(answer):
        return f"Correct answer: {answer}"

    return ""
|
||||
|
||||
|
||||
# Number of questions sent to the LLM in one request.
_BATCH_SIZE = 3
# Pause (seconds) after each single-question retry.
_SINGLE_RETRY_DELAY_S = 1
# Pause (seconds) after each processed batch.
_BATCH_DELAY_S = 2.5
# Columns read from paper_questions to build the LLM payload.
_QUESTION_COLUMNS = (
    "id, paper_id, question_number, question_type, score, question_text, "
    "topics, raw_answer_text, correct_option, correct_answer"
)


def _write_analysis(sb, row_id: str, analysis: dict) -> None:
    """Overwrite the three AI columns of one paper_questions row.

    Fields missing from *analysis* are written as empty strings.
    """
    sb.table("paper_questions").update({
        "knowledge_reminder": analysis.get("knowledge_reminder", ""),
        "ai_hint": analysis.get("ai_hint", ""),
        "solution": analysis.get("solution", ""),
    }).eq("id", row_id).execute()


async def _request_analyses(batch: list[dict]) -> list[dict]:
    """Send *batch* through BATCH_ANALYSIS_PROMPT and return the dict items of "analyses"."""
    result = await qwen_json_completion(
        system_prompt=BATCH_ANALYSIS_PROMPT.format(
            questions_payload=json.dumps(batch, ensure_ascii=False),
        ),
        temperature=0.3,
        max_tokens=8192,
    )
    analyses = result.get("analyses") or []
    return [item for item in analyses if isinstance(item, dict)]


async def _regen_single(sb, row_id: str, payload: dict) -> bool:
    """Regenerate and store the analysis for one question; return True on success."""
    try:
        analyses = await _request_analyses([payload])
        if not analyses:
            return False
        _write_analysis(sb, row_id, analyses[0])
        return True
    except Exception as exc:
        # Deliberate best-effort: one bad question must not abort the whole
        # run, but the reason is reported instead of being dropped silently.
        print(f"\n single retry failed for {payload.get('question_number')}: {exc!r}")
        return False


async def regen(dry_run: bool = False) -> None:
    """Regenerate knowledge_reminder / ai_hint / solution for COMP2211 course-library questions.

    Steps:

    1. Select the ids of ``papers`` rows with ``course_code == "COMP2211"`` and
       ``source_kind == "course_library"``.
    2. Load their ``paper_questions`` rows ordered by paper, then display order.
    3. Per paper, send the questions to the LLM in batches of ``_BATCH_SIZE``
       and UPDATE each row with the analysis whose ``question_number`` matches.
    4. A question with no usable batch result (batch call failed, analysis
       missing, or row write failed) is retried on its own.

    Args:
        dry_run: When True, only print the planned batches; the LLM is not
            called and no row is updated. The two SELECT queries still run.

    Returns:
        None. Progress and a final count are printed to stdout.
    """
    sb = get_supabase()

    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    # NOTE(review): Supabase/PostgREST projects commonly cap a response at
    # 1000 rows; if this course exceeds that, the query needs pagination.
    # TODO confirm against the project's API settings.
    questions = (
        sb.table("paper_questions")
        .select(_QUESTION_COLUMNS)
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )
    print(f"Found {len(questions)} questions across {len(paper_ids)} papers.")

    # Group (row id, payload) pairs by paper so a batch never mixes papers and
    # each payload stays attached to the row it must update.
    items_by_paper: dict[str, list[tuple[str, dict]]] = {}
    for q in questions:
        payload = {
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q.get("question_text") or "",
            "topics": q.get("topics") or [],
            "reference_answer": build_reference_answer(q),
        }
        items_by_paper.setdefault(q["paper_id"], []).append((q["id"], payload))

    total_updated = 0
    total_papers = len(items_by_paper)

    for paper_idx, (paper_id, items) in enumerate(items_by_paper.items(), 1):
        print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} — {len(items)} questions")

        # Chunking the pairs (not just the payloads) removes the index
        # arithmetic that previously had to mirror chunked()'s slicing.
        for batch_idx, chunk in enumerate(chunked(items, _BATCH_SIZE), 1):
            pairs = list(chunk)
            batch = [payload for _, payload in pairs]
            print(
                f" Batch {batch_idx}: questions {[b['question_number'] for b in batch]}",
                end="",
                flush=True,
            )

            if dry_run:
                print(" [dry-run, skipped]")
                continue

            # Only the LLM call is guarded here. Row writes are handled per
            # row below, so a write failure can no longer re-run (and count
            # twice) rows that were already updated.
            try:
                analyses = await _request_analyses(batch)
            except Exception as exc:
                print(f" [batch error, retrying 1-by-1: {exc!r}]")
                analyses = []

            # Key by str() so an integer question_number from the LLM still
            # matches a string one in the payload (and vice versa).
            by_qnum = {
                str(item["question_number"]): item
                for item in analyses
                if "question_number" in item
            }

            written = 0
            for row_id, payload in pairs:
                qnum = payload["question_number"]
                analysis = by_qnum.get(str(qnum))
                ok = False

                if analysis:
                    try:
                        _write_analysis(sb, row_id, analysis)
                        ok = True
                    except Exception as exc:
                        print(f"\n write failed for {qnum}: {exc!r}")

                if not ok:
                    # Fallback: retry this single question alone.
                    ok = await _regen_single(sb, row_id, payload)
                    await asyncio.sleep(_SINGLE_RETRY_DELAY_S)

                if ok:
                    written += 1
                else:
                    print(f"\n SKIP: {qnum}")

            total_updated += written
            print(f" → {written}/{len(pairs)} written")

            await asyncio.sleep(_BATCH_DELAY_S)

    print(f"\nDone. {total_updated} questions updated.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: a literal "--dry-run" anywhere on the command line
    # switches regen() to preview mode (no LLM calls, no database writes).
    dry_run = "--dry-run" in sys.argv
    asyncio.run(regen(dry_run=dry_run))
|
||||
Reference in New Issue
Block a user