"""PDF 文本提取 — 复用 SOS 的 text_extractor 逻辑""" import base64 import fitz # PyMuPDF from dataclasses import dataclass @dataclass class ExtractedContent: pages_text: list[str] # 每页文本 page_images: dict[int, str] # 页码 → base64 图片(图片密集型页面) total_pages: int has_images: bool def extract_pdf(file_bytes: bytes) -> ExtractedContent: """从 PDF 提取文本和图片""" doc = fitz.open(stream=file_bytes, filetype="pdf") pages_text = [] page_images = {} for i, page in enumerate(doc): text = page.get_text("text") pages_text.append(text) # 如果某页文本很少但有图片,可能是扫描件 → 保存为图片用于 Vision OCR if len(text.strip()) < 50: pix = page.get_pixmap(dpi=200) img_bytes = pix.tobytes("png") page_images[i] = base64.b64encode(img_bytes).decode("utf-8") doc.close() return ExtractedContent( pages_text=pages_text, page_images=page_images, total_pages=len(pages_text), has_images=len(page_images) > 0, ) def get_full_text(extracted: ExtractedContent) -> str: """合并所有页面文本""" return "\n\n".join( f"--- Page {i+1} ---\n{text}" for i, text in enumerate(extracted.pages_text) if text.strip() )