175 lines
6.6 KiB
Python
175 lines
6.6 KiB
Python
"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions.
|
|
|
|
Reads the existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by
paper_processor.py, but issues an UPDATE instead of an INSERT, so the question structure
is left untouched.

Run from the backend directory:

    uv run python regen_ai_trio_comp2211.py

Pass --dry-run to print the batches without calling the LLM or writing to the database.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import sys
|
|
from app.services.supabase_client import get_supabase
|
|
from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked
|
|
|
|
|
|
def build_reference_answer(q: dict) -> str:
    """Return the reference-answer text to show the LLM for question row *q*.

    Preference order:
      1. ``raw_answer_text`` verbatim, when truthy;
      2. ``"Correct option: <correct_option>"``, when that field is truthy;
      3. ``"Correct answer: <correct_answer>"``, when that field is truthy;
      4. the empty string when none of the fields carries a value.
    """
    verbatim = q.get("raw_answer_text")
    if verbatim:
        return verbatim

    # Remaining candidates are rendered with a human-readable label, in priority order.
    labelled_fields = (
        ("correct_option", "Correct option"),
        ("correct_answer", "Correct answer"),
    )
    for field, label in labelled_fields:
        value = q.get(field)
        if value:
            return f"{label}: {value}"
    return ""
|
|
|
|
|
|
async def _request_analyses(payloads: list[dict]) -> list:
    """Send *payloads* through BATCH_ANALYSIS_PROMPT and return the ``analyses`` list.

    Returns an empty list when the response has no (or a null) ``analyses`` entry.
    Exceptions raised by the LLM call propagate to the caller.
    """
    response = await qwen_json_completion(
        system_prompt=BATCH_ANALYSIS_PROMPT.format(
            questions_payload=json.dumps(payloads, ensure_ascii=False),
        ),
        temperature=0.3,
        max_tokens=8192,
    )
    return response.get("analyses") or []


def _write_trio(sb, row_id: str, analysis: dict) -> None:
    """UPDATE the three AI columns of the paper_questions row *row_id* from *analysis*.

    Missing keys in *analysis* are written as empty strings.
    """
    sb.table("paper_questions").update({
        "knowledge_reminder": analysis.get("knowledge_reminder", ""),
        "ai_hint": analysis.get("ai_hint", ""),
        "solution": analysis.get("solution", ""),
    }).eq("id", row_id).execute()


async def _regen_single(sb, row_id: str, payload: dict) -> bool:
    """Regenerate and store the trio for one question; return True on success.

    Best-effort by design: any failure (LLM call, empty response, malformed
    item, DB write) is reported on stdout and turned into ``False`` so the
    caller can carry on with the remaining questions.
    """
    try:
        analyses = await _request_analyses([payload])
        if not analyses:
            return False
        _write_trio(sb, row_id, analyses[0])
    except Exception as exc:  # deliberate catch-all: one bad question must not abort the run
        print(f"\n single retry failed for {payload['question_number']}: {exc!r}")
        return False
    return True


async def regen(dry_run: bool = False, *, batch_size: int = 3) -> None:
    """Regenerate knowledge_reminder / ai_hint / solution for COMP2211 course-library questions.

    Loads every ``papers`` row with course_code "COMP2211" and source_kind
    "course_library", fetches their ``paper_questions`` rows, and sends the
    questions to the LLM paper by paper in batches of *batch_size*. Each
    returned analysis is written back with an UPDATE on the question's ``id``.

    Failure handling is per question: if the batch call fails, or an analysis
    is missing for a question, or its write fails, only that question is
    retried on its own. Rows that were already written are never re-sent, so
    the final count reflects distinct questions.

    Args:
        dry_run: When True, print the batches without calling the LLM or
            writing to the database.
        batch_size: Number of questions sent to the LLM per request (default 3).

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sb = get_supabase()

    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", "COMP2211")
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print("No COMP2211 course-library papers found.")
        return

    # NOTE(review): a single select may be capped by a server-side row limit —
    # confirm the question count printed below matches expectations.
    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )
    print(f"Found {len(questions)} questions across {len(paper_ids)} papers.")

    # Group (row id, LLM payload) pairs by paper so batches don't mix papers
    # (cleaner context for the LLM). Keeping the id next to its payload means
    # batching can never misalign the two.
    rows_by_paper: dict[str, list[tuple[str, dict]]] = {}
    for q in questions:
        payload = {
            "question_number": q["question_number"],
            "question_type": q["question_type"] or "long_question",
            "score": q.get("score") or "unknown",
            "question_text": q.get("question_text") or "",
            "topics": q.get("topics") or [],
            "reference_answer": build_reference_answer(q),
        }
        rows_by_paper.setdefault(q["paper_id"], []).append((q["id"], payload))

    total_updated = 0
    total_papers = len(rows_by_paper)

    for paper_idx, (paper_id, rows) in enumerate(rows_by_paper.items(), 1):
        print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} — {len(rows)} questions")

        for batch_idx, batch in enumerate(chunked(rows, batch_size), 1):
            batch_payloads = [payload for _, payload in batch]
            print(
                f" Batch {batch_idx}: questions {[p['question_number'] for p in batch_payloads]}",
                end="",
                flush=True,
            )

            if dry_run:
                print(" [dry-run, skipped]")
                continue

            # Only the LLM call and response indexing live in this try. Writes
            # are handled per row below, so one failing write cannot trigger a
            # re-run (and re-count) of rows that already succeeded.
            batch_failed = False
            analyses: dict[str, dict] = {}
            try:
                raw_items = await _request_analyses(batch_payloads)
                # Key by str() so an int/str mismatch in the echoed question
                # number does not force a needless retry; malformed items are
                # ignored and their questions fall back to a single retry.
                analyses = {
                    str(item["question_number"]): item
                    for item in raw_items
                    if isinstance(item, dict) and "question_number" in item
                }
            except Exception as exc:  # deliberate catch-all: degrade to 1-by-1
                batch_failed = True
                print(f" [batch error: {exc!r}, retrying 1-by-1]")

            written = 0
            for row_id, payload in batch:
                qnum = payload["question_number"]
                analysis = analyses.get(str(qnum))
                updated = False

                if analysis:
                    try:
                        _write_trio(sb, row_id, analysis)
                        updated = True
                    except Exception as exc:
                        print(f"\n write failed for {qnum}: {exc!r}")

                if not updated:
                    # Fallback: retry this single question alone.
                    updated = await _regen_single(sb, row_id, payload)
                    if batch_failed:
                        # Space out the individual calls after a failed batch.
                        await asyncio.sleep(1)

                if updated:
                    written += 1
                    total_updated += 1
                else:
                    print(f"\n SKIP: {qnum}")

            if batch_failed:
                print(f" → {written}/{len(batch)} written")
            else:
                print(f" → {written} written")

            # Pause between batches to stay gentle on the LLM endpoint.
            await asyncio.sleep(2.5)

    print(f"\nDone. {total_updated} questions updated.")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the presence of --dry-run anywhere on the command
    # line switches regen() to print-only mode (no LLM calls, no DB writes).
    asyncio.run(regen(dry_run="--dry-run" in sys.argv))
|