"""Backfill AI trio for questions where knowledge_reminder IS NULL. For each question, generates fields in two separate LLM calls to avoid token truncation: Call 1 → knowledge_reminder + ai_hint (short, ~500 tokens output) Call 2 → solution (long, up to 4096 tokens output) Run from the backend directory: uv run python backfill_null_ai_trio.py [--dry-run] """ from __future__ import annotations import asyncio import json import sys from app.services.supabase_client import get_supabase from app.services.paper_processor import qwen_json_completion KNOWLEDGE_HINT_PROMPT = """\ You are an expert tutor. Given a past-paper question, produce two short study aids in English. Return JSON exactly: {{ "knowledge_reminder": "2-4 sentences summarising the key concept or formula the student must recall.", "ai_hint": "1-3 sentence nudge that guides WITHOUT giving the answer away." }} Question: {payload} """ SOLUTION_PROMPT = """\ You are an expert tutor. Given a past-paper question and its reference answer, write a clear, \ step-by-step model solution in English. Show all working. Be thorough but stop when the answer \ is complete — do not pad. Return JSON exactly: {{ "solution": "" }} Question: {payload} """ def build_payload(q: dict) -> dict: ref = "" if q.get("raw_answer_text"): ref = q["raw_answer_text"] elif q.get("correct_option"): ref = f"Correct option: {q['correct_option']}" elif q.get("correct_answer"): ref = f"Correct answer: {q['correct_answer']}" return { "question_number": q["question_number"], "question_type": q["question_type"] or "long_question", "score": q.get("score") or "unknown", "question_text": q.get("question_text") or "", "topics": q.get("topics") or [], "reference_answer": ref, } async def process_one(sb, q: dict, dry_run: bool) -> bool: payload_str = json.dumps(build_payload(q), ensure_ascii=False) row_id = q["id"] qnum = q["question_number"] if dry_run: print(f" [dry-run] would process {qnum}") return True update: dict = {} # ── Call 1: knowledge_reminder + ai_hint ───────────────────────── try: r1 = await qwen_json_completion( system_prompt=KNOWLEDGE_HINT_PROMPT.format(payload=payload_str), temperature=0.3, max_tokens=1024, ) if r1.get("knowledge_reminder"): update["knowledge_reminder"] = r1["knowledge_reminder"] if r1.get("ai_hint"): update["ai_hint"] = r1["ai_hint"] except Exception as e: print(f" WARN call-1 failed for {qnum}: {e}") await asyncio.sleep(1) # ── Call 2: solution ────────────────────────────────────────────── try: r2 = await qwen_json_completion( system_prompt=SOLUTION_PROMPT.format(payload=payload_str), temperature=0.3, max_tokens=4096, ) if r2.get("solution"): update["solution"] = r2["solution"] except Exception as e: print(f" WARN call-2 failed for {qnum}: {e}") if not update: print(f" SKIP {qnum}: both calls returned nothing") return False sb.table("paper_questions").update(update).eq("id", row_id).execute() return True async def backfill(dry_run: bool = False) -> None: sb = get_supabase() papers = ( sb.table("papers") .select("id") .eq("course_code", "COMP2211") .eq("source_kind", "course_library") .execute() .data ) paper_ids = [p["id"] for p in papers] if not paper_ids: print("No COMP2211 course-library papers found.") return questions = ( sb.table("paper_questions") .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer") .in_("paper_id", paper_ids) .is_("knowledge_reminder", "null") .order("paper_id") .order("display_order") .execute() .data ) if not questions: print("No NULL questions found — all done!") return print(f"Found {len(questions)} questions with NULL knowledge_reminder.") # Group by paper for cleaner output from collections import defaultdict by_paper: dict[str, list] = defaultdict(list) for q in questions: by_paper[q["paper_id"]].append(q) total_updated = 0 for paper_idx, (paper_id, qs) in enumerate(by_paper.items(), 1): print(f"\n[{paper_idx}/{len(by_paper)}] paper_id={paper_id} — {len(qs)} NULL questions") for q in qs: print(f" Processing {q['question_number']}...", end=" ", flush=True) ok = await process_one(sb, q, dry_run) if ok: total_updated += 1 print("done") await asyncio.sleep(1.5) print(f"\nDone. {total_updated}/{len(questions)} questions updated.") if __name__ == "__main__": dry_run = "--dry-run" in sys.argv asyncio.run(backfill(dry_run=dry_run))