"""Regenerate AI trio (knowledge_reminder, ai_hint, solution) for all COMP2211 course-library questions. Reads existing paper_questions rows and runs the same BATCH_ANALYSIS_PROMPT used by paper_processor.py — but does UPDATE instead of INSERT, so question structure is untouched. Run from the backend directory: uv run python regen_ai_trio_comp2211.py Pass --dry-run to print batches without calling the LLM or writing to the database. """ from __future__ import annotations import asyncio import json import sys from app.services.supabase_client import get_supabase from app.services.paper_processor import BATCH_ANALYSIS_PROMPT, qwen_json_completion, chunked def build_reference_answer(q: dict) -> str: if q.get("raw_answer_text"): return q["raw_answer_text"] if q.get("correct_option"): return f"Correct option: {q['correct_option']}" if q.get("correct_answer"): return f"Correct answer: {q['correct_answer']}" return "" async def regen(dry_run: bool = False) -> None: sb = get_supabase() papers = ( sb.table("papers") .select("id") .eq("course_code", "COMP2211") .eq("source_kind", "course_library") .execute() .data ) paper_ids = [p["id"] for p in papers] if not paper_ids: print("No COMP2211 course-library papers found.") return questions = ( sb.table("paper_questions") .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer") .in_("paper_id", paper_ids) .order("paper_id") .order("display_order") .execute() .data ) print(f"Found {len(questions)} questions across {len(paper_ids)} papers.") payloads = [ { "question_number": q["question_number"], "question_type": q["question_type"] or "long_question", "score": q.get("score") or "unknown", "question_text": q.get("question_text") or "", "topics": q.get("topics") or [], "reference_answer": build_reference_answer(q), } for q in questions ] id_by_qnum_paper: dict[tuple[str, str], str] = { (q["paper_id"], q["question_number"]): q["id"] for q in questions } paper_id_by_qnum: dict[str, str] = { q["question_number"]: q["paper_id"] for q in questions } # Group payloads by paper so batches don't mix papers (cleaner context for LLM) from collections import defaultdict payloads_by_paper: dict[str, list[dict]] = defaultdict(list) for q, payload in zip(questions, payloads): payloads_by_paper[q["paper_id"]].append((q["id"], payload)) total_updated = 0 total_papers = len(payloads_by_paper) for paper_idx, (paper_id, items) in enumerate(payloads_by_paper.items(), 1): ids = [item[0] for item in items] batch_payloads = [item[1] for item in items] print(f"\n[{paper_idx}/{total_papers}] paper_id={paper_id} — {len(batch_payloads)} questions") for batch_idx, batch in enumerate(chunked(batch_payloads, 3), 1): print(f" Batch {batch_idx}: questions {[b['question_number'] for b in batch]}", end="", flush=True) if dry_run: print(" [dry-run, skipped]") continue batch_start = (batch_idx - 1) * 3 batch_ids = ids[batch_start: batch_start + 3] async def run_single(row_id: str, payload: dict) -> bool: try: r = await qwen_json_completion( system_prompt=BATCH_ANALYSIS_PROMPT.format( questions_payload=json.dumps([payload], ensure_ascii=False), ), temperature=0.3, max_tokens=8192, ) items = r.get("analyses", []) if not items: return False analysis = items[0] sb.table("paper_questions").update({ "knowledge_reminder": analysis.get("knowledge_reminder", ""), "ai_hint": analysis.get("ai_hint", ""), "solution": analysis.get("solution", ""), }).eq("id", row_id).execute() return True except Exception: return False try: result = await qwen_json_completion( system_prompt=BATCH_ANALYSIS_PROMPT.format( questions_payload=json.dumps(batch, ensure_ascii=False), ), temperature=0.3, max_tokens=8192, ) analyses = {item["question_number"]: item for item in result.get("analyses", [])} written = 0 for row_id, payload in zip(batch_ids, batch): qnum = payload["question_number"] analysis = analyses.get(qnum) if not analysis: # fallback: retry this single question alone ok = await run_single(row_id, payload) if ok: written += 1 total_updated += 1 else: print(f"\n SKIP: {qnum}") else: sb.table("paper_questions").update({ "knowledge_reminder": analysis.get("knowledge_reminder", ""), "ai_hint": analysis.get("ai_hint", ""), "solution": analysis.get("solution", ""), }).eq("id", row_id).execute() written += 1 total_updated += 1 print(f" → {written} written") except Exception as exc: # batch failed entirely — retry each question individually print(f" [batch error, retrying 1-by-1]") written = 0 for row_id, payload in zip(batch_ids, batch): ok = await run_single(row_id, payload) if ok: written += 1 total_updated += 1 await asyncio.sleep(1) print(f" → {written}/{len(batch)} written") await asyncio.sleep(2.5) print(f"\nDone. {total_updated} questions updated.") if __name__ == "__main__": dry_run = "--dry-run" in sys.argv asyncio.run(regen(dry_run=dry_run))