Files
PastpaperMaster/backend/regen_ai_trio_comp2211.py
Zhao 7a09167261 Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 12:27:47 +07:00

175 lines
6.6 KiB
Python

"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions.
Reads existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by
paper_processor.py — but does UPDATE instead of INSERT, so question structure is untouched.
Run from the backend directory:
uv run python regen_ai_trio_comp2211.py
Pass --dry-run to print batches without calling the LLM or writing to the database.
"""
from __future__ import annotations
import asyncio
import json
import sys
from app.services.supabase_client import get_supabase
from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked
def build_reference_answer(q: dict) -> str:
    """Return the best available reference answer for question row *q*.

    The fields are consulted in priority order: ``raw_answer_text`` (returned
    verbatim), then ``correct_option``, then ``correct_answer`` (both returned
    with a short label). An empty string is returned when none is present.

    A field counts as missing only when it is ``None``, a blank /
    whitespace-only string, or an empty container. Legitimate falsy answers
    such as ``0`` or ``False`` are therefore kept instead of being dropped.
    """

    def _present(value: object) -> bool:
        # Distinguish "no data" from falsy-but-meaningful values (0, False).
        if value is None:
            return False
        if isinstance(value, str):
            return bool(value.strip())
        if isinstance(value, (list, tuple, dict, set)):
            return bool(value)
        return True

    raw = q.get("raw_answer_text")
    if _present(raw):
        # str() is a no-op for strings and keeps the declared return type
        # honest for any other stored value.
        return str(raw)

    option = q.get("correct_option")
    if _present(option):
        return f"Correct option: {option}"

    answer = q.get("correct_answer")
    if _present(answer):
        return f"Correct answer: {answer}"

    return ""
# Number of questions sent to the LLM in a single request.
_BATCH_SIZE = 3


def _write_analysis(sb, row_id: str, analysis: dict) -> None:
    """Overwrite the AI trio columns of one ``paper_questions`` row.

    Any of the three fields missing from *analysis* is written as an empty
    string.
    """
    sb.table("paper_questions").update({
        "knowledge_reminder": analysis.get("knowledge_reminder", ""),
        "ai_hint": analysis.get("ai_hint", ""),
        "solution": analysis.get("solution", ""),
    }).eq("id", row_id).execute()


async def _request_analyses(payloads: list[dict]) -> list[dict]:
    """Run BATCH_ANALYSIS_PROMPT over *payloads* and return the analyses.

    The response is expected to be a mapping with an ``"analyses"`` list of
    dicts; anything that does not fit that shape is dropped, so a malformed
    response yields an empty list rather than an exception. Errors raised by
    the completion call itself propagate to the caller.
    """
    result = await qwen_json_completion(
        system_prompt=BATCH_ANALYSIS_PROMPT.format(
            questions_payload=json.dumps(payloads, ensure_ascii=False),
        ),
        temperature=0.3,
        max_tokens=8192,
    )
    if not isinstance(result, dict):
        return []
    return [a for a in (result.get("analyses") or []) if isinstance(a, dict)]


async def _regen_single(sb, row_id: str, payload: dict) -> bool:
    """Regenerate and store the trio for one question; return True on success.

    Best-effort: any failure (LLM call or database write) is reported and
    turned into ``False`` so the caller can carry on with the next question.
    """
    try:
        analyses = await _request_analyses([payload])
        if not analyses:
            return False
        # Only one question was sent, so the first analysis is taken as-is.
        _write_analysis(sb, row_id, analyses[0])
        return True
    except Exception as exc:  # deliberate best-effort boundary
        print(f"\n ERROR: {payload.get('question_number')}: {exc!r}")
        return False


async def regen(dry_run: bool = False) -> None:
    """Regenerate knowledge_reminder / ai_hint / solution for COMP2211 questions.

    Loads every ``paper_questions`` row belonging to COMP2211 course-library
    papers, groups the rows by paper, and sends them to the LLM in batches of
    ``_BATCH_SIZE``. Each returned analysis is written back with an UPDATE on
    the matching row id.

    Failure handling:
      * If the batch request fails, every question of the batch is retried on
        its own, pausing one second between retries.
      * If the batch succeeds but a question has no matching analysis, or its
        write fails, only that question is retried on its own.

    Args:
        dry_run: When true, only print the batches; no LLM call and no
            database write is made.
    """
    sb = get_supabase()
    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    ) or []
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    ) or []
    print(f"Found {len(questions)} questions across {len(paper_ids)} papers.")

    # Group (row id, LLM payload) pairs by paper so a batch never mixes papers
    # (cleaner context for the LLM). Dict insertion order keeps the query order.
    items_by_paper: dict[str, list[tuple[str, dict]]] = {}
    for q in questions:
        payload = {
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q.get("question_text") or "",
            "topics": q.get("topics") or [],
            "reference_answer": build_reference_answer(q),
        }
        items_by_paper.setdefault(q["paper_id"], []).append((q["id"], payload))

    total_updated = 0
    total_papers = len(items_by_paper)
    for paper_idx, (paper_id, items) in enumerate(items_by_paper.items(), 1):
        ids = [row_id for row_id, _ in items]
        batch_payloads = [payload for _, payload in items]
        print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} ({len(batch_payloads)} questions)")

        for batch_idx, batch in enumerate(chunked(batch_payloads, _BATCH_SIZE), 1):
            print(f" Batch {batch_idx}: questions {[b['question_number'] for b in batch]}", end="", flush=True)
            if dry_run:
                print(" [dry-run, skipped]")
                continue

            # Row ids are kept parallel to the payload list, so the ids of this
            # batch sit at the same offsets as its payloads.
            batch_start = (batch_idx - 1) * _BATCH_SIZE
            batch_ids = ids[batch_start: batch_start + _BATCH_SIZE]

            written = 0
            # Only the LLM request is guarded here. Database failures are
            # handled per row below, so one failed write can no longer send
            # already-written rows through the retry path and count them twice.
            try:
                analyses = await _request_analyses(batch)
            except Exception as exc:
                # batch failed entirely — retry each question individually
                print(f" [batch error, retrying 1-by-1: {exc!r}]")
                for row_id, payload in zip(batch_ids, batch):
                    if await _regen_single(sb, row_id, payload):
                        written += 1
                    await asyncio.sleep(1)
                print(f"{written}/{len(batch)} written")
            else:
                # Compare question numbers as strings so "1" and 1 still match.
                by_qnum = {
                    str(a["question_number"]): a
                    for a in analyses
                    if a.get("question_number") is not None
                }
                for row_id, payload in zip(batch_ids, batch):
                    qnum = payload["question_number"]
                    analysis = by_qnum.get(str(qnum))
                    ok = False
                    if analysis:
                        try:
                            _write_analysis(sb, row_id, analysis)
                            ok = True
                        except Exception as exc:
                            print(f"\n WRITE ERROR: {qnum}: {exc!r}")
                    if not ok:
                        # fallback: retry this single question alone
                        ok = await _regen_single(sb, row_id, payload)
                    if ok:
                        written += 1
                    else:
                        print(f"\n SKIP: {qnum}")
                print(f" {written} written")

            total_updated += written
            # Pause between batches before issuing the next LLM request.
            await asyncio.sleep(2.5)

    print(f"\nDone. {total_updated} questions updated.")
if __name__ == "__main__":
    # Script entry point: the presence of --dry-run anywhere on the command
    # line switches regen() to print-only mode.
    asyncio.run(regen(dry_run=("--dry-run" in sys.argv)))