Add EasyEDA Pro EPRO2 source ingestion (3/5 batch test)

打通 oshwhub origin=pro 现代 Pro 3.x 工程的 EPRO2 源抓取链路。

3/5 modern Pro 项目完整解出（共 8423 docs / 542 MB plain）：
 - X86 主板 7374 docs / 481 MB plain (chain=85, editor=3.2.15)
 - 220V 桌面电源 771 docs / 26 MB plain (chain=28, editor=3.2.69)
 - ESP-VoCat 278 docs / 7.5 MB plain (chain=12, editor=3.2.91)

剩余 2/5 是 legacy Pro 2.x（立创泰山派 RK3566、梁山派），项目 meta 返回 branch_uuid=null + editorVersion="2.1.40"，没有 git-style chain 模型，文档直接挂在 boards[].sch/pcb 字段上，访问端点暂未挖通；元数据落库 metadata.json，source/ 留空。

实现要点：
 - fetch_pro_source(): 4 步流程（project → branch HEAD → structures → /branches/<B>/histories/<HEAD> 即返完整 chain，无需 ?limit 批量端点）+ 逐 history 走 AES-128-GCM 解密（16 字节 IV，pycryptodome 原生支持）+ gunzip + 按 DOCHEAD 切 per-doc EPRO2 流
 - EPRO2 解析坑：行末单 `|` 是行终止符不是字段分隔符，必须先 rstrip("|") 再 split("||")，否则 payload JSON 解析失败 silently swallow 导致 cur_doc 不设 → 第一轮 X86 板 7374 docs 抽出来只剩 2 个
 - docType 实测远不止 BOARD/PCB/SCH/SCH_PAGE，还含 SYMBOL / FOOTPRINT / DEVICE / BLOB / FONT / CONFIG —— Pro 把组件库快照也随项目存到 history，下游做 EPRO2→KiCad 转换时必须先把这些 lib doc 加载进 symbol cache
 - Pro 2.x vs 3.x 是不同存储模型 —— 3.x 走 branch 模型（已打通），2.x 走 boards[] 直链（未打通）；判别条件：project meta 的 branch_uuid 是否为 null

CLI 新增 --with-pro-source / --backfill-pro-source / --pro-cookie / --origin（按 origin 字段服务端过滤 listing API），crawl_one() 按 origin=pro 自动 dispatch 到 Pro fetcher。

schema：docType 类型从 integer 放宽到 [integer, string, null]（兼容 Std 的 1/3 + Pro 的 BOARD/SCH 等），新增 message_count 字段。

License 注意：本批 5 个项目全是 NC-SA / GPL，未达 Pro source doc §4.2 Forge 白名单（MIT/BSD/Apache/CC0/CC-BY/CERN-OHL-P/Unlicense）。按 CLAUDE.md "研究用、不再分发" 原则 raw 入库无碍；Forge 投影时另过白名单。详细技术细节见 docs/sources/easyeda_pro_source.md rev 3 + log.md。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 21:45:52 +08:00
parent d874278bc5
commit 3282a028c4
8455 changed files with 2275248 additions and 36 deletions
--- a/crawlers/oshwhub/crawler.py
+++ b/crawlers/oshwhub/crawler.py
@@ -33,6 +33,9 @@ API_PROJECT = "https://oshwhub.com/api/project"  # /api/project/<uuid> for sourc
 BASE = "https://oshwhub.com"
 IMG_CDN = "https://image.lceda.cn"
 LCEDA_DOC_API = "https://lceda.cn/api/documents"
+PRO_API = "https://pro.lceda.cn/api/v4"
+PRO_EDITOR_VERSION = "3.2.127"
+PRO_COOKIE_PATH_DEFAULT = "/home/ubuntu/.secrets/pro-lceda-cookie-header.txt"
 UA = "FacereDataset/0.1 (+https://git.deepknow.site/Facere/FacereDataset)"
 # Std source endpoints reject FacereDataset UA on oshwhub /api/project; spoof browser UA only there.
 # See docs/sources/easyeda_std_source.md §3.
@@ -42,6 +45,7 @@ BROWSER_UA = (
 )
 SLEEP_BETWEEN = 2.0  # seconds between detail-page / file fetches
 SLEEP_SOURCE = 5.0   # source fetch is sensitive — QPS ≤ 0.2 per CLAUDE.md登录态 spirit
+SLEEP_PRO = 5.0      # Pro is logged-in; same QPS ≤ 0.2 per docs/sources/easyeda_pro_source.md §4.1


 # ---------------------------------------------------------------------------
@@ -75,6 +79,33 @@ def make_source_client(timeout: float = 60.0) -> httpx.Client:
    )


+def make_pro_source_client(
+    cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
+    timeout: float = 90.0,
+) -> httpx.Client:
+    """Client for Pro source endpoints (pro.lceda.cn /api/v4/...).
+
+    Requires logged-in cookie header at `cookie_path` (mode 600). The cookie
+    file is a single Cookie header value (e.g. `lceda_pro_session=...; XSRF-TOKEN=...`).
+    Per-request `path: <project_uuid>` header MUST be added by callers — see
+    docs/sources/easyeda_pro_source.md §2.5.
+    """
+    cookie = Path(cookie_path).read_text(encoding="utf-8").strip()
+    return httpx.Client(
+        http2=True,
+        timeout=timeout,
+        headers={
+            "User-Agent": BROWSER_UA,
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Editor-Version": PRO_EDITOR_VERSION,
+            "Referer": "https://pro.lceda.cn/editor",
+            "Cookie": cookie,
+        },
+        follow_redirects=False,
+    )
+
+
 def polite_sleep() -> None:
    time.sleep(SLEEP_BETWEEN)

@@ -88,8 +119,12 @@ def list_projects(
    page: int = 1,
    page_size: int = 30,
    sort: str = "hot",
+    origin: str | None = None,
 ) -> dict:
-    r = client.get(API_LIST, params={"page": page, "pageSize": page_size, "sort": sort})
+    params: dict[str, object] = {"page": page, "pageSize": page_size, "sort": sort}
+    if origin:
+        params["origin"] = origin  # 'std' or 'pro' — server-side filter
+    r = client.get(API_LIST, params=params)
    r.raise_for_status()
    data = r.json()
    if not data.get("success"):
@@ -331,6 +366,237 @@ def _extract_editor_version(body_json: dict) -> str | None:
    return None


+# ---------------------------------------------------------------------------
+# Pro source fetch  (pro.lceda.cn — EPRO2 message stream, AES-128-GCM)
+# See docs/sources/easyeda_pro_source.md.
+# ---------------------------------------------------------------------------
+
+def _pro_get_json(
+    client: httpx.Client,
+    url: str,
+    project_uuid: str,
+    *,
+    params: dict | None = None,
+) -> dict | list:
+    """GET a pro.lceda.cn /api/v4 endpoint with the per-project `path` header.
+
+    Raises if the JSON envelope's `success` is False; returns `result`.
+    """
+    r = client.get(url, params=params, headers={"path": project_uuid})
+    r.raise_for_status()
+    j = r.json()
+    if not j.get("success"):
+        raise RuntimeError(f"Pro API failed (url={url}): {j}")
+    return j["result"]
+
+
+def _order_history_chain(chain: list[dict]) -> list[dict]:
+    """Return the chain ordered root→HEAD by walking parent links.
+
+    Pro returns the chain HEAD-first as a flat list with `parent` links. We
+    walk from the unique root forward.
+    """
+    by_uuid = {h["uuid"]: h for h in chain}
+    roots = [h for h in chain if h.get("parent") not in by_uuid]
+    if len(roots) != 1:
+        raise RuntimeError(
+            f"history chain has {len(roots)} roots; not strictly linear"
+        )
+    children: dict[str | None, list[dict]] = {}
+    for h in chain:
+        children.setdefault(h.get("parent"), []).append(h)
+    ordered: list[dict] = []
+    cur: dict | None = roots[0]
+    while cur is not None:
+        ordered.append(cur)
+        nexts = children.get(cur["uuid"], [])
+        if len(nexts) > 1:
+            raise RuntimeError(
+                f"history chain not linear at {cur['uuid']!r}: {len(nexts)} children"
+            )
+        cur = nexts[0] if nexts else None
+    if len(ordered) != len(chain):
+        raise RuntimeError(
+            f"reconstructed {len(ordered)} of {len(chain)} histories; chain has cycles or orphans"
+        )
+    return ordered
+
+
+def fetch_pro_source(
+    pro_client: httpx.Client,
+    project_uuid: str,
+    proj_dir: Path,
+    sleep: float = SLEEP_PRO,
+) -> dict:
+    """Fetch EasyEDA Pro project source: full history chain, AES-GCM decrypted,
+    gunzipped, and partitioned into per-document EPRO2 streams.
+
+    Side effects under ``proj_dir``:
+      - source/structure.json     — project document tree (boards/schematics/sheets/pcbs/...)
+      - source/<doc_uuid>.epro2   — one file per document, raw EPRO2 messages (newline-separated)
+      - source/manifest.json      — per-doc index + chain summary
+
+    Returns dict matching the shape `fetch_std_source` returns.
+    """
+    import gzip
+    from collections import OrderedDict
+    from Crypto.Cipher import AES  # local import: cheap if pycryptodome already loaded
+
+    src_dir = proj_dir / "source"
+    src_dir.mkdir(parents=True, exist_ok=True)
+
+    # 1. project meta -> branch_uuid + editor_version fallback
+    proj = _pro_get_json(pro_client, f"{PRO_API}/projects/{project_uuid}", project_uuid)
+    branch_uuid = proj.get("branch_uuid")
+    if not branch_uuid:
+        raise RuntimeError(f"no branch_uuid in project meta for {project_uuid}")
+    # Some projects' DOCHEAD payloads lack `editVersion`; project meta has `editorVersion`
+    project_editor_version = proj.get("editorVersion")
+    time.sleep(sleep)
+
+    # 2. branch meta -> head history_uuid
+    branch = _pro_get_json(
+        pro_client,
+        f"{PRO_API}/projects/{project_uuid}/branches/{branch_uuid}",
+        project_uuid,
+    )
+    head_uuid = branch.get("history_uuid")
+    if not head_uuid:
+        raise RuntimeError(f"no history_uuid (HEAD) on branch {branch_uuid}")
+    time.sleep(sleep)
+
+    # 3. structure tree
+    st = _pro_get_json(
+        pro_client,
+        f"{PRO_API}/projects/{project_uuid}/branches/{branch_uuid}/structures",
+        project_uuid,
+    )
+    raw_structure = st.get("structure")
+    structure = json.loads(raw_structure) if isinstance(raw_structure, str) else raw_structure
+    (src_dir / "structure.json").write_text(
+        json.dumps(structure, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    time.sleep(sleep)
+
+    # 4. history chain — single endpoint returns full chain (HAR-confirmed 2026-04-28)
+    chain = _pro_get_json(
+        pro_client,
+        f"{PRO_API}/projects/{project_uuid}/branches/{branch_uuid}/histories/{head_uuid}",
+        project_uuid,
+    )
+    if not isinstance(chain, list) or not chain:
+        raise RuntimeError(f"unexpected histories response: {type(chain).__name__}")
+    ordered = _order_history_chain(chain)
+    time.sleep(sleep)
+
+    # 5. download + decrypt + gunzip + partition by DOCHEAD
+    docs: OrderedDict[str, dict] = OrderedDict()
+    cur_doc: str | None = None
+    bytes_blob_total = 0
+    bytes_plain_total = 0
+    for h in ordered:
+        blob_r = pro_client.get(h["dataStrUrl"], headers={"path": project_uuid})
+        blob_r.raise_for_status()
+        blob = blob_r.content
+        bytes_blob_total += len(blob)
+        if len(blob) < 16:
+            raise RuntimeError(f"history {h['uuid']} blob too short ({len(blob)} B)")
+        ct, tag = blob[:-16], blob[-16:]
+        cipher = AES.new(
+            bytes.fromhex(h["key"]),
+            AES.MODE_GCM,
+            nonce=bytes.fromhex(h["iv"]),
+        )
+        gz = cipher.decrypt_and_verify(ct, tag)
+        plain = gzip.decompress(gz)
+        bytes_plain_total += len(plain)
+        for ln in plain.split(b"\n"):
+            if not ln.strip():
+                continue
+            # EPRO2 lines use `||` as field separator and terminate with a single
+            # `|`. Strip the trailing `|` first so each part parses as bare JSON.
+            stripped = ln.rstrip(b"|")
+            parts = stripped.split(b"||")
+            try:
+                head_msg = json.loads(parts[0])
+            except Exception:  # noqa: BLE001 — malformed head; skip entire line
+                continue
+            if head_msg.get("type") == "DOCHEAD" and len(parts) >= 2:
+                try:
+                    payload = json.loads(parts[1])
+                except Exception:  # noqa: BLE001
+                    payload = {}
+                new_doc = payload.get("uuid")
+                if new_doc:
+                    cur_doc = new_doc
+                    if cur_doc not in docs:
+                        docs[cur_doc] = {
+                            "lines": [],
+                            "doc_head": payload,
+                        }
+            if cur_doc and cur_doc in docs:
+                docs[cur_doc]["lines"].append(ln)
+        time.sleep(sleep)
+
+    # 6. write per-doc .epro2 + manifest
+    doc_metas: list[dict] = []
+    editor_version: str | None = None
+    for doc_uuid, info in docs.items():
+        body = b"\n".join(info["lines"]) + b"\n"
+        local_rel = f"source/{doc_uuid}.epro2"
+        local_path = proj_dir / local_rel
+        local_path.write_bytes(body)
+        size = len(body)
+        sha = hashlib.sha256(body).hexdigest()
+        head = info["doc_head"]
+        ev = head.get("editVersion") or head.get("editorVersion")
+        if ev and not editor_version:
+            editor_version = str(ev)
+        doc_metas.append({
+            "doc_uuid": doc_uuid,
+            "docType": head.get("docType"),  # "BOARD" / "PCB" / "SCH" / "SCH_PAGE" / "SYMBOL" / ...
+            "path": local_rel,
+            "size": size,
+            "sha256": sha,
+            "message_count": len(info["lines"]),
+        })
+
+    # editor_version fallback: project meta if no DOCHEAD payload had it
+    if not editor_version and project_editor_version:
+        editor_version = str(project_editor_version)
+
+    structure_summary: dict[str, int] = {}
+    if isinstance(structure, dict):
+        for k, v in structure.items():
+            if isinstance(v, dict):
+                structure_summary[k] = len(v)
+            elif isinstance(v, list):
+                structure_summary[k] = len(v)
+
+    manifest = {
+        "project_uuid": project_uuid,
+        "branch_uuid": branch_uuid,
+        "head_uuid": head_uuid,
+        "fetched_at": datetime.now(timezone.utc).isoformat(),
+        "editor_version": editor_version,
+        "chain_length": len(chain),
+        "blob_bytes_total": bytes_blob_total,
+        "plain_bytes_total": bytes_plain_total,
+        "documents": doc_metas,
+        "structure_summary": structure_summary,
+    }
+    (src_dir / "manifest.json").write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+
+    return {
+        "source_format": "easyeda-pro",
+        "source_path": "source/",
+        "source_documents": doc_metas,
+        "editor_version": editor_version,
+    }
+
+
 # ---------------------------------------------------------------------------
 # Single-project crawl
 # ---------------------------------------------------------------------------
@@ -350,6 +616,7 @@ def crawl_one(
    out_root: Path,
    fetch_files: bool = True,
    source_client: httpx.Client | None = None,
+    pro_source_client: httpx.Client | None = None,
 ) -> CrawlResult:
    uuid = list_item["uuid"]
    path = list_item["path"]
@@ -446,9 +713,20 @@ def crawl_one(
        json.dumps(urls_manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

-    # 6. Optional: EasyEDA Std project source (schematic + PCB dataStr)
+    # 6. Optional: EasyEDA project source — dispatch on origin (std vs pro)
    src_meta: dict = {}
-    if source_client is not None:
+    origin = list_item.get("origin")
+    if origin == "pro" and pro_source_client is not None:
+        try:
+            src_meta = fetch_pro_source(pro_source_client, uuid, proj_dir)
+            print(
+                f"  pro source: {len(src_meta.get('source_documents', []))} docs, "
+                f"editor={src_meta.get('editor_version')}"
+            )
+        except Exception as e:  # noqa: BLE001
+            print(f"  pro source FAIL: {e}", file=sys.stderr)
+            skipped.append(f"pro_source: {e}")
+    elif origin != "pro" and source_client is not None:
        try:
            src_meta = fetch_std_source(source_client, uuid, proj_dir)
            print(
@@ -525,9 +803,10 @@ def iter_candidates(
    pages: int,
    page_size: int,
    sort: str,
+    origin: str | None = None,
 ) -> Iterator[dict]:
    for p in range(1, pages + 1):
-        res = list_projects(client, page=p, page_size=page_size, sort=sort)
+        res = list_projects(client, page=p, page_size=page_size, sort=sort, origin=origin)
        for it in res["lists"]:
            yield it
        polite_sleep()
@@ -542,6 +821,12 @@ def main(argv: list[str] | None = None) -> int:
    ap.add_argument("--pages", type=int, default=3, help="list API pages to scan")
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
+    ap.add_argument(
+        "--origin",
+        choices=["std", "pro"],
+        default=None,
+        help="filter listing API by origin (server-side); needed to find Pro projects in top-N",
+    )
    ap.add_argument("--uuids", type=str, default=None, help="comma-separated explicit UUID list")
    ap.add_argument("--no-files", action="store_true", help="do not download attachments")
    ap.add_argument("--limit", type=int, default=None, help="override --top, same effect")
@@ -555,6 +840,22 @@ def main(argv: list[str] | None = None) -> int:
        action="store_true",
        help="skip listing/HTML/attachments; only fetch source for projects already in --out",
    )
+    ap.add_argument(
+        "--with-pro-source",
+        action="store_true",
+        help="also fetch EasyEDA Pro project source (full history chain, EPRO2 streams) per project",
+    )
+    ap.add_argument(
+        "--backfill-pro-source",
+        action="store_true",
+        help="skip listing; only fetch Pro source for origin=pro projects already in --out",
+    )
+    ap.add_argument(
+        "--pro-cookie",
+        type=str,
+        default=PRO_COOKIE_PATH_DEFAULT,
+        help="path to file with Cookie header for pro.lceda.cn",
+    )
    args = ap.parse_args(argv)

    n_target = args.limit if args.limit is not None else args.top
@@ -564,13 +865,19 @@ def main(argv: list[str] | None = None) -> int:
    # only fetches source. No listing/HTML/attachment work.
    if args.backfill_source:
        return _run_backfill_source(args.out, only_uuids=args.uuids)
+    if args.backfill_pro_source:
+        return _run_backfill_pro_source(
+            args.out, only_uuids=args.uuids, cookie_path=args.pro_cookie
+        )

    with make_client() as client:
        # Build list of items to crawl
        if args.uuids:
            wanted = set(args.uuids.split(","))
            items: list[dict] = []
-            for it in iter_candidates(client, args.pages, args.page_size, args.sort):
+            for it in iter_candidates(
+                client, args.pages, args.page_size, args.sort, origin=args.origin
+            ):
                if it["uuid"] in wanted:
                    items.append(it)
                if len(items) == len(wanted):
@@ -579,7 +886,11 @@ def main(argv: list[str] | None = None) -> int:
                missing = wanted - {i["uuid"] for i in items}
                print(f"WARN: missing uuids (not in top pages): {missing}", file=sys.stderr)
        else:
-            pool = list(iter_candidates(client, args.pages, args.page_size, args.sort))
+            pool = list(
+                iter_candidates(
+                    client, args.pages, args.page_size, args.sort, origin=args.origin
+                )
+            )
            items = pick_top(
                pool, n=n_target, min_likes=args.min_likes, min_grade=args.min_grade
            )
@@ -591,6 +902,9 @@ def main(argv: list[str] | None = None) -> int:
                )

        source_client_ctx = make_source_client() if args.with_source else None
+        pro_source_client_ctx = (
+            make_pro_source_client(args.pro_cookie) if args.with_pro_source else None
+        )
        try:
            print(f"Crawling {len(items)} projects -> {args.out}")
            for i, it in enumerate(items, 1):
@@ -602,6 +916,7 @@ def main(argv: list[str] | None = None) -> int:
                        args.out,
                        fetch_files=not args.no_files,
                        source_client=source_client_ctx,
+                        pro_source_client=pro_source_client_ctx,
                    )
                    print(
                        f"  OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
@@ -612,6 +927,8 @@ def main(argv: list[str] | None = None) -> int:
        finally:
            if source_client_ctx is not None:
                source_client_ctx.close()
+            if pro_source_client_ctx is not None:
+                pro_source_client_ctx.close()

    return 0

@@ -664,5 +981,67 @@ def _run_backfill_source(out_root: Path, only_uuids: str | None = None) -> int:
    return 0


+def _run_backfill_pro_source(
+    out_root: Path,
+    only_uuids: str | None = None,
+    cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
+) -> int:
+    """Walk per-project dirs in out_root, fetch Pro source for origin=pro projects.
+
+    A project is considered Pro by either: existing metadata.json marks
+    raw_fields.origin == 'pro', OR --uuids was passed and includes this UUID
+    (caller is asserting Pro).
+    """
+    wanted: set[str] | None = set(only_uuids.split(",")) if only_uuids else None
+    targets: list[Path] = []
+    for d in sorted(out_root.iterdir()):
+        if not d.is_dir():
+            continue
+        meta_path = d / "metadata.json"
+        if not meta_path.exists():
+            continue
+        if wanted is not None:
+            if d.name not in wanted:
+                continue
+        else:
+            try:
+                m = json.loads(meta_path.read_text(encoding="utf-8"))
+            except Exception:  # noqa: BLE001
+                continue
+            if (m.get("raw_fields") or {}).get("origin") != "pro":
+                continue
+        targets.append(d)
+
+    print(f"Backfill pro source for {len(targets)} projects under {out_root}")
+    pro_client = make_pro_source_client(cookie_path=cookie_path)
+    try:
+        for i, proj_dir in enumerate(targets, 1):
+            uuid = proj_dir.name
+            meta_path = proj_dir / "metadata.json"
+            meta = json.loads(meta_path.read_text(encoding="utf-8"))
+            print(f"[{i}/{len(targets)}] {uuid}  ({meta.get('title', '?')})")
+            try:
+                src_meta = fetch_pro_source(pro_client, uuid, proj_dir)
+            except Exception as e:  # noqa: BLE001
+                print(f"  FAIL: {e}", file=sys.stderr)
+                continue
+            meta["source_format"] = src_meta["source_format"]
+            meta["source_path"] = src_meta["source_path"]
+            meta["source_documents"] = src_meta["source_documents"]
+            if src_meta.get("editor_version"):
+                meta["editor_version"] = src_meta["editor_version"]
+            meta_path.write_text(
+                json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
+            )
+            total = sum(d["size"] for d in src_meta["source_documents"])
+            print(
+                f"  OK: {len(src_meta['source_documents'])} docs, "
+                f"{total / 1024 / 1024:.1f} MB plain, editor={src_meta.get('editor_version')}"
+            )
+    finally:
+        pro_client.close()
+    return 0
+
+
 if __name__ == "__main__":
    raise SystemExit(main())