Add EasyEDA Pro EPRO2 source ingestion (3/5 batch test)
打通 oshwhub origin=pro 现代 Pro 3.x 工程的 EPRO2 源抓取链路。3/5
modern Pro 项目完整解出(共 8423 docs / 542 MB plain):
- X86 主板 7374 docs / 481 MB plain (chain=85, editor=3.2.15)
- 220V 桌面电源 771 docs / 26 MB plain (chain=28, editor=3.2.69)
- ESP-VoCat 278 docs / 7.5 MB plain (chain=12, editor=3.2.91)
剩余 2/5 是 legacy Pro 2.x(立创泰山派 RK3566、梁山派),项目 meta
返回 branch_uuid=null + editorVersion="2.1.40",没有 git-style chain
模型,文档直接挂在 boards[].sch/pcb 字段上,访问端点暂未挖通;元
数据落库 metadata.json,source/ 留空。
实现要点:
- fetch_pro_source(): 4 步流程(project → branch HEAD → structures
→ /branches/<B>/histories/<HEAD> 即返完整 chain,无需 ?limit 批量
端点)+ 逐 history 走 AES-128-GCM 解密(16 字节 IV,pycryptodome
原生支持)+ gunzip + 按 DOCHEAD 切 per-doc EPRO2 流
- EPRO2 解析坑:行末的单个 `|` 是行终止符而不是字段分隔符,必须先
rstrip("|") 再 split("||");否则 payload JSON 解析失败,异常又被静默吞掉,
cur_doc 不会被设置——首轮 X86 主板的 7374 个 docs 因此只抽出 2 个
- docType 实测远不止 BOARD/PCB/SCH/SCH_PAGE,还含 SYMBOL /
FOOTPRINT / DEVICE / BLOB / FONT / CONFIG —— Pro 把组件库快照也
随项目存到 history,下游做 EPRO2→KiCad 转换时必须先把这些 lib
doc 加载进 symbol cache
- Pro 2.x vs 3.x 是不同存储模型 —— 3.x 走 branch 模型(已打通),
2.x 走 boards[] 直链(未打通);判别条件:project meta 的
branch_uuid 是否为 null
CLI 新增 --with-pro-source / --backfill-pro-source / --pro-cookie /
--origin(按 origin 字段服务端过滤 listing API),crawl_one() 按
origin=pro 自动 dispatch 到 Pro fetcher。
schema:docType 类型从 integer 放宽到 [integer, string, null]
(兼容 Std 的 1/3 + Pro 的 BOARD/SCH 等),新增 message_count 字段。
License 注意:本批 5 个项目全是 NC-SA / GPL,未达 Pro source doc
§4.2 Forge 白名单(MIT/BSD/Apache/CC0/CC-BY/CERN-OHL-P/Unlicense)。
按 CLAUDE.md "研究用、不再分发" 原则 raw 入库无碍;Forge 投影时
另过白名单。
详细技术细节见 docs/sources/easyeda_pro_source.md rev 3 + log.md。
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,9 @@ API_PROJECT = "https://oshwhub.com/api/project" # /api/project/<uuid> for sourc
|
||||
BASE = "https://oshwhub.com"
|
||||
IMG_CDN = "https://image.lceda.cn"
|
||||
LCEDA_DOC_API = "https://lceda.cn/api/documents"
|
||||
PRO_API = "https://pro.lceda.cn/api/v4"
|
||||
PRO_EDITOR_VERSION = "3.2.127"
|
||||
PRO_COOKIE_PATH_DEFAULT = "/home/ubuntu/.secrets/pro-lceda-cookie-header.txt"
|
||||
UA = "FacereDataset/0.1 (+https://git.deepknow.site/Facere/FacereDataset)"
|
||||
# Std source endpoints reject FacereDataset UA on oshwhub /api/project; spoof browser UA only there.
|
||||
# See docs/sources/easyeda_std_source.md §3.
|
||||
@@ -42,6 +45,7 @@ BROWSER_UA = (
|
||||
)
|
||||
SLEEP_BETWEEN = 2.0 # seconds between detail-page / file fetches
|
||||
SLEEP_SOURCE = 5.0 # source fetch is sensitive — QPS ≤ 0.2 per CLAUDE.md登录态 spirit
|
||||
SLEEP_PRO = 5.0 # Pro is logged-in; same QPS ≤ 0.2 per docs/sources/easyeda_pro_source.md §4.1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -75,6 +79,33 @@ def make_source_client(timeout: float = 60.0) -> httpx.Client:
|
||||
)
|
||||
|
||||
|
||||
def make_pro_source_client(
    cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
    timeout: float = 90.0,
) -> httpx.Client:
    """Build an HTTP client for Pro source endpoints (pro.lceda.cn /api/v4/...).

    The file at `cookie_path` must contain a single Cookie header value
    (e.g. `lceda_pro_session=...; XSRF-TOKEN=...`); surrounding whitespace is
    stripped. The file is expected to be private (mode 600), but this function
    does not check its permissions.

    The per-request `path: <project_uuid>` header is NOT set here and MUST be
    added by callers — see docs/sources/easyeda_pro_source.md §2.5.

    Args:
        cookie_path: Path of the file holding the Cookie header value.
        timeout: Timeout in seconds handed to `httpx.Client`.

    Returns:
        An `httpx.Client` with browser-like default headers and the cookie set.
        Redirects are not followed. The caller owns the client and must close it.

    Raises:
        OSError: If the cookie file cannot be read.
        RuntimeError: If the cookie file is empty or spans several lines.
    """
    cookie = Path(cookie_path).read_text(encoding="utf-8").strip()
    # Fail fast: an empty cookie would otherwise build a client that sends
    # unauthenticated requests, and the problem would only show up later as an
    # unrelated-looking API error.
    if not cookie:
        raise RuntimeError(f"Pro cookie file is empty: {cookie_path}")
    # A header value cannot contain line breaks; reject them here with a
    # message that points at the file instead of failing deep in the HTTP stack.
    if "\n" in cookie or "\r" in cookie:
        raise RuntimeError(
            f"Pro cookie file must hold a single-line Cookie header value: {cookie_path}"
        )
    return httpx.Client(
        http2=True,
        timeout=timeout,
        headers={
            "User-Agent": BROWSER_UA,
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Editor-Version": PRO_EDITOR_VERSION,
            "Referer": "https://pro.lceda.cn/editor",
            "Cookie": cookie,
        },
        follow_redirects=False,
    )
|
||||
|
||||
|
||||
def polite_sleep() -> None:
    """Pause for `SLEEP_BETWEEN` seconds between consecutive fetches."""
    delay = SLEEP_BETWEEN
    time.sleep(delay)
|
||||
|
||||
@@ -88,8 +119,12 @@ def list_projects(
|
||||
page: int = 1,
|
||||
page_size: int = 30,
|
||||
sort: str = "hot",
|
||||
origin: str | None = None,
|
||||
) -> dict:
|
||||
r = client.get(API_LIST, params={"page": page, "pageSize": page_size, "sort": sort})
|
||||
params: dict[str, object] = {"page": page, "pageSize": page_size, "sort": sort}
|
||||
if origin:
|
||||
params["origin"] = origin # 'std' or 'pro' — server-side filter
|
||||
r = client.get(API_LIST, params=params)
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
if not data.get("success"):
|
||||
@@ -331,6 +366,237 @@ def _extract_editor_version(body_json: dict) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pro source fetch (pro.lceda.cn — EPRO2 message stream, AES-128-GCM)
|
||||
# See docs/sources/easyeda_pro_source.md.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _pro_get_json(
    client: httpx.Client,
    url: str,
    project_uuid: str,
    *,
    params: dict | None = None,
) -> dict | list:
    """Fetch one pro.lceda.cn /api/v4 endpoint and unwrap its JSON envelope.

    Every request carries the per-project `path` header set to `project_uuid`.

    Args:
        client: HTTP client used for the request.
        url: Absolute endpoint URL.
        project_uuid: Project UUID sent as the `path` header.
        params: Optional query parameters.

    Returns:
        The envelope's `result` member.

    Raises:
        httpx.HTTPStatusError: Via `raise_for_status()` on an error status.
        RuntimeError: If the envelope's `success` member is missing or falsy.
    """
    response = client.get(url, headers={"path": project_uuid}, params=params)
    response.raise_for_status()
    envelope = response.json()
    if envelope.get("success"):
        return envelope["result"]
    raise RuntimeError(f"Pro API failed (url={url}): {envelope}")
|
||||
|
||||
|
||||
def _order_history_chain(chain: list[dict]) -> list[dict]:
|
||||
"""Return the chain ordered root→HEAD by walking parent links.
|
||||
|
||||
Pro returns the chain HEAD-first as a flat list with `parent` links. We
|
||||
walk from the unique root forward.
|
||||
"""
|
||||
by_uuid = {h["uuid"]: h for h in chain}
|
||||
roots = [h for h in chain if h.get("parent") not in by_uuid]
|
||||
if len(roots) != 1:
|
||||
raise RuntimeError(
|
||||
f"history chain has {len(roots)} roots; not strictly linear"
|
||||
)
|
||||
children: dict[str | None, list[dict]] = {}
|
||||
for h in chain:
|
||||
children.setdefault(h.get("parent"), []).append(h)
|
||||
ordered: list[dict] = []
|
||||
cur: dict | None = roots[0]
|
||||
while cur is not None:
|
||||
ordered.append(cur)
|
||||
nexts = children.get(cur["uuid"], [])
|
||||
if len(nexts) > 1:
|
||||
raise RuntimeError(
|
||||
f"history chain not linear at {cur['uuid']!r}: {len(nexts)} children"
|
||||
)
|
||||
cur = nexts[0] if nexts else None
|
||||
if len(ordered) != len(chain):
|
||||
raise RuntimeError(
|
||||
f"reconstructed {len(ordered)} of {len(chain)} histories; chain has cycles or orphans"
|
||||
)
|
||||
return ordered
|
||||
|
||||
|
||||
def _require_dict(value: object, what: str) -> dict:
    """Return `value` if it is a JSON object, else raise RuntimeError naming `what`."""
    if not isinstance(value, dict):
        raise RuntimeError(f"unexpected {what} response: {type(value).__name__}")
    return value


def _is_safe_doc_name(name: str) -> bool:
    """Return True if `name` is usable as a single file-name component."""
    return name not in ("", ".", "..") and not any(ch in name for ch in "/\\\x00")


def _decrypt_pro_history(blob: bytes, h: dict) -> bytes:
    """Decrypt one history blob (AES-GCM, tag in the last 16 bytes) and gunzip it."""
    import gzip

    from Crypto.Cipher import AES  # local import: cheap if pycryptodome already loaded

    if len(blob) < 16:
        raise RuntimeError(f"history {h['uuid']} blob too short ({len(blob)} B)")
    ct, tag = blob[:-16], blob[-16:]
    try:
        cipher = AES.new(
            bytes.fromhex(h["key"]),
            AES.MODE_GCM,
            nonce=bytes.fromhex(h["iv"]),
        )
        gz = cipher.decrypt_and_verify(ct, tag)
    except ValueError as e:
        # Bad hex, wrong key length or a failed tag check all surface as
        # ValueError; re-raise with the history id so the failure is traceable.
        raise RuntimeError(f"history {h['uuid']} failed AES-GCM decryption: {e}") from e
    return gzip.decompress(gz)


def _parse_epro2_line(ln: bytes) -> tuple[bool, dict | None]:
    """Classify one non-blank EPRO2 line.

    Returns `(ok, dochead_payload)`:
    - `ok` is False when the first field is not a JSON object (line unusable).
    - `dochead_payload` is None for every line that is not a DOCHEAD carrying
      a second field; for such a DOCHEAD it is the payload dict, or `{}` when
      the payload does not parse as a JSON object.
    """
    # EPRO2 lines use `||` as field separator and terminate with a single `|`.
    # Strip the trailing `|` first so each part parses as bare JSON.
    parts = ln.rstrip(b"|").split(b"||")
    try:
        head_msg = json.loads(parts[0])
    except (ValueError, RecursionError):
        return False, None
    if not isinstance(head_msg, dict):
        return False, None
    if head_msg.get("type") != "DOCHEAD" or len(parts) < 2:
        return True, None
    try:
        payload = json.loads(parts[1])
    except (ValueError, RecursionError):
        payload = {}
    return True, payload if isinstance(payload, dict) else {}


def fetch_pro_source(
    pro_client: httpx.Client,
    project_uuid: str,
    proj_dir: Path,
    sleep: float = SLEEP_PRO,
) -> dict:
    """Fetch EasyEDA Pro project source: full history chain, AES-GCM decrypted,
    gunzipped, and partitioned into per-document EPRO2 streams.

    Side effects under ``proj_dir``:
      - source/structure.json   — project document tree as returned by the API
      - source/<doc_uuid>.epro2 — one file per document, raw EPRO2 lines joined by newlines
      - source/manifest.json    — per-doc index + chain summary

    Lines are assigned to the document named by the most recent DOCHEAD line.
    Lines whose first field is not a JSON object are dropped; they, and DOCHEAD
    lines without a usable string uuid, are counted and reported once on stderr.

    Args:
        pro_client: Client whose default headers carry the Pro cookie.
        project_uuid: Project UUID; also sent as the `path` header.
        proj_dir: Project output directory; `source/` is created inside it.
        sleep: Seconds to wait after each request.

    Returns:
        Dict with keys `source_format`, `source_path`, `source_documents` and
        `editor_version`.

    Raises:
        RuntimeError: On missing branch/HEAD ids, unexpected API response
            shapes, a non-linear chain, an undecryptable blob, or a document
            uuid that is not safe to use as a file name.
    """
    src_dir = proj_dir / "source"
    src_dir.mkdir(parents=True, exist_ok=True)

    # 1. project meta -> branch_uuid + editor_version fallback
    proj = _require_dict(
        _pro_get_json(pro_client, f"{PRO_API}/projects/{project_uuid}", project_uuid),
        "project meta",
    )
    branch_uuid = proj.get("branch_uuid")
    if not branch_uuid:
        raise RuntimeError(f"no branch_uuid in project meta for {project_uuid}")
    # Some projects' DOCHEAD payloads lack `editVersion`; project meta has `editorVersion`
    project_editor_version = proj.get("editorVersion")
    time.sleep(sleep)

    # 2. branch meta -> head history_uuid
    branch = _require_dict(
        _pro_get_json(
            pro_client,
            f"{PRO_API}/projects/{project_uuid}/branches/{branch_uuid}",
            project_uuid,
        ),
        "branch meta",
    )
    head_uuid = branch.get("history_uuid")
    if not head_uuid:
        raise RuntimeError(f"no history_uuid (HEAD) on branch {branch_uuid}")
    time.sleep(sleep)

    # 3. structure tree (the API may deliver it as a JSON-encoded string)
    st = _require_dict(
        _pro_get_json(
            pro_client,
            f"{PRO_API}/projects/{project_uuid}/branches/{branch_uuid}/structures",
            project_uuid,
        ),
        "structures",
    )
    raw_structure = st.get("structure")
    structure = json.loads(raw_structure) if isinstance(raw_structure, str) else raw_structure
    (src_dir / "structure.json").write_text(
        json.dumps(structure, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    time.sleep(sleep)

    # 4. history chain — single endpoint returns full chain (HAR-confirmed 2026-04-28)
    chain = _pro_get_json(
        pro_client,
        f"{PRO_API}/projects/{project_uuid}/branches/{branch_uuid}/histories/{head_uuid}",
        project_uuid,
    )
    if not isinstance(chain, list) or not chain:
        raise RuntimeError(f"unexpected histories response: {type(chain).__name__}")
    ordered = _order_history_chain(chain)
    time.sleep(sleep)

    # 5. download + decrypt + gunzip + partition by DOCHEAD
    docs: dict[str, dict] = {}  # insertion order = first appearance in the chain
    cur_doc: str | None = None
    bytes_blob_total = 0
    bytes_plain_total = 0
    malformed_lines = 0
    docheads_without_uuid = 0
    for h in ordered:
        missing = [k for k in ("dataStrUrl", "key", "iv") if not h.get(k)]
        if missing:
            raise RuntimeError(f"history {h['uuid']} lacks required field(s): {missing}")
        # NOTE(review): the client's default Cookie header is sent to whatever
        # host `dataStrUrl` names — confirm it never points outside lceda.cn.
        blob_r = pro_client.get(h["dataStrUrl"], headers={"path": project_uuid})
        blob_r.raise_for_status()
        blob = blob_r.content
        bytes_blob_total += len(blob)
        plain = _decrypt_pro_history(blob, h)
        bytes_plain_total += len(plain)
        for ln in plain.split(b"\n"):
            if not ln.strip():
                continue
            ok, payload = _parse_epro2_line(ln)
            if not ok:
                malformed_lines += 1  # malformed head; skip entire line
                continue
            if payload is not None:
                new_doc = payload.get("uuid")
                if isinstance(new_doc, str) and new_doc:
                    # The uuid comes from the server and becomes a file name.
                    if not _is_safe_doc_name(new_doc):
                        raise RuntimeError(
                            f"history {h['uuid']}: unsafe document uuid {new_doc!r}"
                        )
                    cur_doc = new_doc
                    if cur_doc not in docs:
                        docs[cur_doc] = {"lines": [], "doc_head": payload}
                else:
                    # The current document stays unchanged, so the following
                    # lines are still attributed to the previous document.
                    docheads_without_uuid += 1
            if cur_doc and cur_doc in docs:
                docs[cur_doc]["lines"].append(ln)
        time.sleep(sleep)

    if malformed_lines or docheads_without_uuid:
        # Surface parse trouble instead of swallowing it: a silent failure
        # here makes documents vanish from the output without any error.
        print(
            f"  pro source WARN: {malformed_lines} unparseable line(s) skipped, "
            f"{docheads_without_uuid} DOCHEAD line(s) without usable uuid",
            file=sys.stderr,
        )

    # 6. write per-doc .epro2 + manifest
    doc_metas: list[dict] = []
    editor_version: str | None = None
    for doc_uuid, info in docs.items():
        body = b"\n".join(info["lines"]) + b"\n"
        local_rel = f"source/{doc_uuid}.epro2"
        (proj_dir / local_rel).write_bytes(body)
        head = info["doc_head"]
        ev = head.get("editVersion") or head.get("editorVersion")
        if ev and not editor_version:
            editor_version = str(ev)
        doc_metas.append({
            "doc_uuid": doc_uuid,
            "docType": head.get("docType"),  # "BOARD" / "PCB" / "SCH" / "SCH_PAGE" / "SYMBOL" / ...
            "path": local_rel,
            "size": len(body),
            "sha256": hashlib.sha256(body).hexdigest(),
            "message_count": len(info["lines"]),
        })

    # editor_version fallback: project meta if no DOCHEAD payload had it
    if not editor_version and project_editor_version:
        editor_version = str(project_editor_version)

    # Number of entries in each top-level container of the structure tree.
    structure_summary: dict[str, int] = {}
    if isinstance(structure, dict):
        structure_summary = {
            k: len(v) for k, v in structure.items() if isinstance(v, (dict, list))
        }

    manifest = {
        "project_uuid": project_uuid,
        "branch_uuid": branch_uuid,
        "head_uuid": head_uuid,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "editor_version": editor_version,
        "chain_length": len(chain),
        "blob_bytes_total": bytes_blob_total,
        "plain_bytes_total": bytes_plain_total,
        "documents": doc_metas,
        "structure_summary": structure_summary,
    }
    (src_dir / "manifest.json").write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    return {
        "source_format": "easyeda-pro",
        "source_path": "source/",
        "source_documents": doc_metas,
        "editor_version": editor_version,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Single-project crawl
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -350,6 +616,7 @@ def crawl_one(
|
||||
out_root: Path,
|
||||
fetch_files: bool = True,
|
||||
source_client: httpx.Client | None = None,
|
||||
pro_source_client: httpx.Client | None = None,
|
||||
) -> CrawlResult:
|
||||
uuid = list_item["uuid"]
|
||||
path = list_item["path"]
|
||||
@@ -446,9 +713,20 @@ def crawl_one(
|
||||
json.dumps(urls_manifest, ensure_ascii=False, indent=2), encoding="utf-8"
|
||||
)
|
||||
|
||||
# 6. Optional: EasyEDA Std project source (schematic + PCB dataStr)
|
||||
# 6. Optional: EasyEDA project source — dispatch on origin (std vs pro)
|
||||
src_meta: dict = {}
|
||||
if source_client is not None:
|
||||
origin = list_item.get("origin")
|
||||
if origin == "pro" and pro_source_client is not None:
|
||||
try:
|
||||
src_meta = fetch_pro_source(pro_source_client, uuid, proj_dir)
|
||||
print(
|
||||
f" pro source: {len(src_meta.get('source_documents', []))} docs, "
|
||||
f"editor={src_meta.get('editor_version')}"
|
||||
)
|
||||
except Exception as e: # noqa: BLE001
|
||||
print(f" pro source FAIL: {e}", file=sys.stderr)
|
||||
skipped.append(f"pro_source: {e}")
|
||||
elif origin != "pro" and source_client is not None:
|
||||
try:
|
||||
src_meta = fetch_std_source(source_client, uuid, proj_dir)
|
||||
print(
|
||||
@@ -525,9 +803,10 @@ def iter_candidates(
|
||||
pages: int,
|
||||
page_size: int,
|
||||
sort: str,
|
||||
origin: str | None = None,
|
||||
) -> Iterator[dict]:
|
||||
for p in range(1, pages + 1):
|
||||
res = list_projects(client, page=p, page_size=page_size, sort=sort)
|
||||
res = list_projects(client, page=p, page_size=page_size, sort=sort, origin=origin)
|
||||
for it in res["lists"]:
|
||||
yield it
|
||||
polite_sleep()
|
||||
@@ -542,6 +821,12 @@ def main(argv: list[str] | None = None) -> int:
|
||||
ap.add_argument("--pages", type=int, default=3, help="list API pages to scan")
|
||||
ap.add_argument("--page-size", type=int, default=30)
|
||||
ap.add_argument("--sort", default="hot")
|
||||
ap.add_argument(
|
||||
"--origin",
|
||||
choices=["std", "pro"],
|
||||
default=None,
|
||||
help="filter listing API by origin (server-side); needed to find Pro projects in top-N",
|
||||
)
|
||||
ap.add_argument("--uuids", type=str, default=None, help="comma-separated explicit UUID list")
|
||||
ap.add_argument("--no-files", action="store_true", help="do not download attachments")
|
||||
ap.add_argument("--limit", type=int, default=None, help="override --top, same effect")
|
||||
@@ -555,6 +840,22 @@ def main(argv: list[str] | None = None) -> int:
|
||||
action="store_true",
|
||||
help="skip listing/HTML/attachments; only fetch source for projects already in --out",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--with-pro-source",
|
||||
action="store_true",
|
||||
help="also fetch EasyEDA Pro project source (full history chain, EPRO2 streams) per project",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--backfill-pro-source",
|
||||
action="store_true",
|
||||
help="skip listing; only fetch Pro source for origin=pro projects already in --out",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--pro-cookie",
|
||||
type=str,
|
||||
default=PRO_COOKIE_PATH_DEFAULT,
|
||||
help="path to file with Cookie header for pro.lceda.cn",
|
||||
)
|
||||
args = ap.parse_args(argv)
|
||||
|
||||
n_target = args.limit if args.limit is not None else args.top
|
||||
@@ -564,13 +865,19 @@ def main(argv: list[str] | None = None) -> int:
|
||||
# only fetches source. No listing/HTML/attachment work.
|
||||
if args.backfill_source:
|
||||
return _run_backfill_source(args.out, only_uuids=args.uuids)
|
||||
if args.backfill_pro_source:
|
||||
return _run_backfill_pro_source(
|
||||
args.out, only_uuids=args.uuids, cookie_path=args.pro_cookie
|
||||
)
|
||||
|
||||
with make_client() as client:
|
||||
# Build list of items to crawl
|
||||
if args.uuids:
|
||||
wanted = set(args.uuids.split(","))
|
||||
items: list[dict] = []
|
||||
for it in iter_candidates(client, args.pages, args.page_size, args.sort):
|
||||
for it in iter_candidates(
|
||||
client, args.pages, args.page_size, args.sort, origin=args.origin
|
||||
):
|
||||
if it["uuid"] in wanted:
|
||||
items.append(it)
|
||||
if len(items) == len(wanted):
|
||||
@@ -579,7 +886,11 @@ def main(argv: list[str] | None = None) -> int:
|
||||
missing = wanted - {i["uuid"] for i in items}
|
||||
print(f"WARN: missing uuids (not in top pages): {missing}", file=sys.stderr)
|
||||
else:
|
||||
pool = list(iter_candidates(client, args.pages, args.page_size, args.sort))
|
||||
pool = list(
|
||||
iter_candidates(
|
||||
client, args.pages, args.page_size, args.sort, origin=args.origin
|
||||
)
|
||||
)
|
||||
items = pick_top(
|
||||
pool, n=n_target, min_likes=args.min_likes, min_grade=args.min_grade
|
||||
)
|
||||
@@ -591,6 +902,9 @@ def main(argv: list[str] | None = None) -> int:
|
||||
)
|
||||
|
||||
source_client_ctx = make_source_client() if args.with_source else None
|
||||
pro_source_client_ctx = (
|
||||
make_pro_source_client(args.pro_cookie) if args.with_pro_source else None
|
||||
)
|
||||
try:
|
||||
print(f"Crawling {len(items)} projects -> {args.out}")
|
||||
for i, it in enumerate(items, 1):
|
||||
@@ -602,6 +916,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
args.out,
|
||||
fetch_files=not args.no_files,
|
||||
source_client=source_client_ctx,
|
||||
pro_source_client=pro_source_client_ctx,
|
||||
)
|
||||
print(
|
||||
f" OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
|
||||
@@ -612,6 +927,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
finally:
|
||||
if source_client_ctx is not None:
|
||||
source_client_ctx.close()
|
||||
if pro_source_client_ctx is not None:
|
||||
pro_source_client_ctx.close()
|
||||
|
||||
return 0
|
||||
|
||||
@@ -664,5 +981,67 @@ def _run_backfill_source(out_root: Path, only_uuids: str | None = None) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _run_backfill_pro_source(
    out_root: Path,
    only_uuids: str | None = None,
    cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
) -> int:
    """Walk per-project dirs in out_root, fetch Pro source for origin=pro projects.

    A directory is a target when it contains metadata.json and either:
    - `only_uuids` is given and the directory name is one of the listed UUIDs
      (the caller is asserting Pro; the origin field is not checked), or
    - `only_uuids` is not given and metadata.json has raw_fields.origin == 'pro'.

    After a successful fetch the project's metadata.json is rewritten with the
    source fields. A failure for one project is reported on stderr and does not
    stop the run.

    Args:
        out_root: Directory holding one sub-directory per project UUID.
        only_uuids: Optional comma-separated UUID list.
        cookie_path: File with the Cookie header value for pro.lceda.cn.

    Returns:
        Always 0.
    """
    wanted: set[str] | None = None
    if only_uuids:
        # Tolerate spaces around the commas, e.g. "a, b".
        wanted = {u.strip() for u in only_uuids.split(",") if u.strip()}

    targets: list[Path] = []
    for d in sorted(out_root.iterdir()):
        if not d.is_dir():
            continue
        meta_path = d / "metadata.json"
        if not meta_path.exists():
            continue
        if wanted is not None:
            if d.name not in wanted:
                continue
        else:
            try:
                m = json.loads(meta_path.read_text(encoding="utf-8"))
            except Exception:  # noqa: BLE001 — unreadable metadata: not a candidate
                continue
            if not isinstance(m, dict) or (m.get("raw_fields") or {}).get("origin") != "pro":
                continue
        targets.append(d)

    print(f"Backfill pro source for {len(targets)} projects under {out_root}")
    with make_pro_source_client(cookie_path=cookie_path) as pro_client:
        for i, proj_dir in enumerate(targets, 1):
            uuid = proj_dir.name
            meta_path = proj_dir / "metadata.json"
            # Re-read right before the fetch. Explicitly requested UUIDs were
            # never parsed during selection, so a broken file must be handled
            # here — otherwise one bad project aborts the whole batch.
            try:
                meta = json.loads(meta_path.read_text(encoding="utf-8"))
                if not isinstance(meta, dict):
                    raise ValueError("top-level JSON value is not an object")
            except (OSError, ValueError) as e:
                print(f"[{i}/{len(targets)}] {uuid} (?)")
                print(f"  FAIL: unreadable metadata.json: {e}", file=sys.stderr)
                continue
            print(f"[{i}/{len(targets)}] {uuid} ({meta.get('title', '?')})")
            try:
                src_meta = fetch_pro_source(pro_client, uuid, proj_dir)
            except Exception as e:  # noqa: BLE001
                print(f"  FAIL: {e}", file=sys.stderr)
                continue
            meta["source_format"] = src_meta["source_format"]
            meta["source_path"] = src_meta["source_path"]
            meta["source_documents"] = src_meta["source_documents"]
            if src_meta.get("editor_version"):
                meta["editor_version"] = src_meta["editor_version"]
            meta_path.write_text(
                json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
            )
            total = sum(d["size"] for d in src_meta["source_documents"])
            print(
                f"  OK: {len(src_meta['source_documents'])} docs, "
                f"{total / 1024 / 1024:.1f} MB plain, editor={src_meta.get('editor_version')}"
            )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user