Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
169
backend/backfill_null_ai_trio.py
Normal file
169
backend/backfill_null_ai_trio.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""Backfill AI trio for questions where knowledge_reminder IS NULL.
|
||||
|
||||
For each question, the three fields (knowledge_reminder, ai_hint, solution) are generated in two separate LLM calls to avoid token truncation:
|
||||
Call 1 → knowledge_reminder + ai_hint (short, ~500 tokens output)
|
||||
Call 2 → solution (long, up to 4096 tokens output)
|
||||
|
||||
Run from the backend directory:
|
||||
uv run python backfill_null_ai_trio.py [--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from app.services.supabase_client import get_supabase
|
||||
from app.services.paper_processor import qwen_json_completion
|
||||
|
||||
|
||||
KNOWLEDGE_HINT_PROMPT = """\
|
||||
You are an expert tutor. Given a past-paper question, produce two short study aids in English.
|
||||
|
||||
Return JSON exactly:
|
||||
{{
|
||||
"knowledge_reminder": "2-4 sentences summarising the key concept or formula the student must recall.",
|
||||
"ai_hint": "1-3 sentence nudge that guides WITHOUT giving the answer away."
|
||||
}}
|
||||
|
||||
Question:
|
||||
{payload}
|
||||
"""
|
||||
|
||||
SOLUTION_PROMPT = """\
|
||||
You are an expert tutor. Given a past-paper question and its reference answer, write a clear, \
|
||||
step-by-step model solution in English. Show all working. Be thorough but stop when the answer \
|
||||
is complete — do not pad.
|
||||
|
||||
Return JSON exactly:
|
||||
{{
|
||||
"solution": "<full step-by-step solution as a single string, use \\n for line breaks>"
|
||||
}}
|
||||
|
||||
Question:
|
||||
{payload}
|
||||
"""
|
||||
|
||||
|
||||
def build_payload(q: dict) -> dict:
    """Build the question payload that is embedded in the LLM prompts.

    The reference answer is taken from the first non-empty source, in this
    order: ``raw_answer_text`` (used verbatim), ``correct_option``, then
    ``correct_answer`` (the latter two are prefixed with a short label). If
    none is present the reference answer is an empty string.

    Args:
        q: Question row. ``question_number`` is required; every other key is
            optional and replaced by a default when missing or empty.

    Returns:
        A dict with the keys ``question_number``, ``question_type``,
        ``score``, ``question_text``, ``topics`` and ``reference_answer``.

    Raises:
        KeyError: if ``q`` has no ``question_number``.
    """
    if q.get("raw_answer_text"):
        ref = q["raw_answer_text"]
    elif q.get("correct_option"):
        ref = f"Correct option: {q['correct_option']}"
    elif q.get("correct_answer"):
        ref = f"Correct answer: {q['correct_answer']}"
    else:
        ref = ""

    return {
        "question_number": q["question_number"],
        # .get() so that a missing key falls back the same way a NULL value
        # does, consistent with the other optional fields below.
        "question_type": q.get("question_type") or "long_question",
        "score": q.get("score") or "unknown",
        "question_text": q.get("question_text") or "",
        "topics": q.get("topics") or [],
        "reference_answer": ref,
    }
|
||||
|
||||
|
||||
async def process_one(sb, q: dict, dry_run: bool) -> bool:
    """Generate the AI fields for one question row and persist them.

    Two LLM calls are made through ``qwen_json_completion``: the first asks
    for ``knowledge_reminder`` and ``ai_hint`` (``max_tokens=1024``), the
    second for ``solution`` (``max_tokens=4096``). A failure in either call is
    printed and tolerated; whatever non-empty fields were obtained are written
    to ``paper_questions`` in a single update keyed by ``q["id"]``.

    Args:
        sb: Database client; only its
            ``table(...).update(...).eq(...).execute()`` chain is used here.
        q: Question row. Must contain ``id`` and ``question_number``; the
            remaining keys are consumed by ``build_payload``.
        dry_run: When true, report the question and return without building
            the payload, calling the LLM or touching the database.

    Returns:
        ``True`` if the row was updated (or would be, in dry-run mode),
        ``False`` if neither call produced a usable field.

    Raises:
        KeyError: if ``q`` lacks ``id`` or ``question_number``.
        Errors raised by the database update are not caught here and
        propagate to the caller.
    """
    row_id = q["id"]
    qnum = q["question_number"]

    if dry_run:
        print(f" [dry-run] would process {qnum}")
        return True

    # Serialise only once we know the LLM will actually be called.
    payload_str = json.dumps(build_payload(q), ensure_ascii=False)
    update: dict = {}

    # Call 1: the two short fields share one request.
    try:
        r1 = await qwen_json_completion(
            system_prompt=KNOWLEDGE_HINT_PROMPT.format(payload=payload_str),
            temperature=0.3,
            max_tokens=1024,
        )
        for field in ("knowledge_reminder", "ai_hint"):
            if r1.get(field):
                update[field] = r1[field]
    except Exception as e:
        # Best effort: a failed call must not prevent the other one.
        print(f" WARN call-1 failed for {qnum}: {e}")

    # Short pause between the two requests for the same question.
    await asyncio.sleep(1)

    # Call 2: the long solution gets its own, larger token budget.
    try:
        r2 = await qwen_json_completion(
            system_prompt=SOLUTION_PROMPT.format(payload=payload_str),
            temperature=0.3,
            max_tokens=4096,
        )
        if r2.get("solution"):
            update["solution"] = r2["solution"]
    except Exception as e:
        print(f" WARN call-2 failed for {qnum}: {e}")

    if not update:
        print(f" SKIP {qnum}: both calls returned nothing")
        return False

    # The backfill query selects rows by knowledge_reminder IS NULL, so once
    # that column is written the row is not picked up again. Make a partial
    # result visible instead of leaving the other fields silently empty.
    if "knowledge_reminder" in update:
        missing = [f for f in ("ai_hint", "solution") if f not in update]
        if missing:
            print(
                f" WARN {qnum}: partial result, missing {', '.join(missing)}; "
                "knowledge_reminder is set so a re-run will not retry this row"
            )

    sb.table("paper_questions").update(update).eq("id", row_id).execute()
    return True
|
||||
|
||||
|
||||
async def backfill(dry_run: bool = False, *, course_code: str = "COMP2211") -> None:
    """Backfill the AI fields for questions whose knowledge_reminder is NULL.

    Looks up all ``papers`` rows with the given course code and
    ``source_kind == "course_library"``, fetches their ``paper_questions``
    rows where ``knowledge_reminder`` is NULL (ordered by ``paper_id`` and
    ``display_order``), and runs ``process_one`` on each of them
    sequentially, printing progress per paper and per question.

    Args:
        dry_run: Passed through to ``process_one``; when true nothing is
            generated or written, and the per-question throttle is skipped.
        course_code: Course whose papers are processed. Defaults to
            ``"COMP2211"``, the value that was previously hard-coded.

    Returns:
        None. Progress and the final count are printed to stdout.
    """
    from collections import defaultdict

    sb = get_supabase()

    papers = (
        sb.table("papers")
        .select("id")
        .eq("course_code", course_code)
        .eq("source_kind", "course_library")
        .execute()
        .data
    )
    paper_ids = [p["id"] for p in papers]
    if not paper_ids:
        print(f"No {course_code} course-library papers found.")
        return

    questions = (
        sb.table("paper_questions")
        .select("id, paper_id, question_number, question_type, score, question_text, topics, raw_answer_text, correct_option, correct_answer")
        .in_("paper_id", paper_ids)
        .is_("knowledge_reminder", "null")
        .order("paper_id")
        .order("display_order")
        .execute()
        .data
    )
    if not questions:
        print("No NULL questions found — all done!")
        return

    print(f"Found {len(questions)} questions with NULL knowledge_reminder.")

    # Group by paper for cleaner output; insertion order follows the query
    # ordering, so papers and their questions are visited in that order.
    by_paper: dict[str, list] = defaultdict(list)
    for q in questions:
        by_paper[q["paper_id"]].append(q)

    total_updated = 0
    for paper_idx, (paper_id, qs) in enumerate(by_paper.items(), 1):
        print(f"\n[{paper_idx}/{len(by_paper)}] paper_id={paper_id} — {len(qs)} NULL questions")
        for q in qs:
            print(f" Processing {q['question_number']}...", end=" ", flush=True)
            ok = await process_one(sb, q, dry_run)
            if ok:
                total_updated += 1
                print("done")
            # Throttle between questions; pointless in dry-run mode because
            # no LLM request was made.
            if not dry_run:
                await asyncio.sleep(1.5)

    print(f"\nDone. {total_updated}/{len(questions)} questions updated.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: a "--dry-run" token anywhere on the command line
    # switches the backfill into dry-run mode.
    dry_run = any(arg == "--dry-run" for arg in sys.argv)
    asyncio.run(backfill(dry_run=dry_run))
|
||||
Reference in New Issue
Block a user