Initial commit: PastPaper Master full stack
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
48
backend/app/services/text_extractor.py
Normal file
48
backend/app/services/text_extractor.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""PDF 文本提取 — 复用 SOS 的 text_extractor 逻辑"""
|
||||
|
||||
import base64
|
||||
import fitz # PyMuPDF
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ExtractedContent:
    """Result of extracting one PDF document.

    Attributes:
        pages_text: Plain text of every page, in page order.
        page_images: Base64-encoded page renderings for text-sparse pages,
            keyed by the page's 0-based index.
        total_pages: Number of pages in the document.
        has_images: True when at least one page was rendered to an image.
    """

    pages_text: list[str]
    page_images: dict[int, str]
    total_pages: int
    has_images: bool
|
||||
|
||||
|
||||
def extract_pdf(
    file_bytes: bytes,
    *,
    min_text_chars: int = 50,
    render_dpi: int = 200,
) -> ExtractedContent:
    """Extract per-page text from a PDF and rasterize text-sparse pages.

    The plain text of every page is collected in page order. A page whose
    stripped text is shorter than ``min_text_chars`` is additionally rendered
    to a PNG and stored base64-encoded, so a later step can run OCR on it.

    Args:
        file_bytes: Raw bytes of the PDF document.
        min_text_chars: Pages with fewer stripped characters than this are
            rendered to an image. Defaults to 50.
        render_dpi: Resolution used when rendering a page. Defaults to 200.

    Returns:
        An ExtractedContent holding the per-page text, the base64 PNG of each
        rendered page keyed by its 0-based page index, the page count, and a
        flag telling whether any page was rendered.

    Raises:
        Any exception raised by PyMuPDF while opening or reading the
        document is propagated unchanged.
    """
    pages_text: list[str] = []
    page_images: dict[int, str] = {}

    # The context manager closes the document even when reading or rendering
    # a page raises, so a bad PDF cannot leak the open handle.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for i, page in enumerate(doc):
            text = page.get_text("text")
            pages_text.append(text)

            # Very little text suggests a scanned page: keep a rendering of
            # it for vision-based OCR. Only the text length is checked; the
            # page is not inspected for embedded images.
            if len(text.strip()) < min_text_chars:
                pix = page.get_pixmap(dpi=render_dpi)
                img_bytes = pix.tobytes("png")
                page_images[i] = base64.b64encode(img_bytes).decode("utf-8")

    return ExtractedContent(
        pages_text=pages_text,
        page_images=page_images,
        total_pages=len(pages_text),
        has_images=bool(page_images),
    )
|
||||
|
||||
|
||||
def get_full_text(extracted: ExtractedContent) -> str:
    """Merge the text of all pages into a single string.

    Each page that contains non-whitespace text becomes a section headed by
    ``--- Page N ---`` (N is 1-based); blank pages are skipped but still
    count towards the numbering. Sections are separated by a blank line.

    Args:
        extracted: Extraction result whose ``pages_text`` is merged.

    Returns:
        The concatenated sections, or an empty string when no page has text.
    """
    sections = []
    for page_no, body in enumerate(extracted.pages_text, start=1):
        if not body.strip():
            continue
        sections.append(f"--- Page {page_no} ---\n{body}")
    return "\n\n".join(sections)
|
||||
Reference in New Issue
Block a user