crawler: --skip-ext + --max-source-mb gates for batch-50 expansion

Two CLI gates needed before scaling Pro batch beyond top-5:

--skip-ext mp4,qt,mov (attachment filter)
  Skips video extensions in attachment download. Phase 1 measurements
  showed mp4+qt occupy ~54% of attachment storage. Entry still recorded
  in metadata.json with skipped:ext:<token> so we can re-fetch later if
  the policy changes. Honors both server-declared `ext` and filename
  suffix, case-insensitively.

--max-source-mb N (Pro source size cap)
  Trips inside the chain replay loop on encrypted-blob total. On trip:
  raise ProjectOversizeError, wipe partial source/, append a row to
  data/state/oshwhub_pro_oversize.jsonl. Lets us shortlist 50+ Pro
  projects without one X86-board-class outlier (~500 MB) blowing the
  LFS budget. Std and Pro 2.x legacy are not capped (both <2 MB in
  sample).

Verified:
- cap=0 trips on first blob (1.2 MB), source/ wiped, state recorded
- cap=100 runs full ESP-VoCat (7.5 MB plain, 278 docs)
- skip-ext microtest: 8/8 cases (case-insensitive, declared/suffix
  fallback, empty-token edge cases)

Plan + frozen candidate list for the next 50 projects:
- docs/plans/oshwhub_batch50.md
- data/state/oshwhub_batch50_candidates.jsonl (gitignore exception added)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 00:24:55 +08:00
parent e61404478e
commit eee1a9b97e
5 changed files with 347 additions and 5 deletions
--- a/crawlers/oshwhub/crawler.py
+++ b/crawlers/oshwhub/crawler.py
@@ -19,6 +19,7 @@ import hashlib
 import html as _html
 import json
 import re
+import shutil
 import sys
 import time
 import urllib.parse
@@ -428,11 +429,42 @@ def _order_history_chain(chain: list[dict]) -> list[dict]:
    return ordered


+class ProjectOversizeError(Exception):
+    """Raised when a Pro project's chain replay exceeds the configured size cap.
+
+    `cap_mb` is the trip threshold in MB; `bytes_so_far` is the *encrypted blob*
+    total accumulated when we tripped (pre-decompression, pre-partition).
+    """
+
+    def __init__(self, bytes_so_far: int, cap_mb: int) -> None:
+        super().__init__(  # one decimal: floor-to-MB read "100 MB > cap 100 MB"
+            f"oversize: blob bytes {bytes_so_far / 1024 / 1024:.1f} MB > cap {cap_mb} MB"
+        )
+        self.bytes_so_far = bytes_so_far
+        self.cap_mb = cap_mb
+
+
+def _record_oversize(out_root: Path, uuid: str, err: ProjectOversizeError) -> None:
+    """Append one JSONL row to data/state/oshwhub_pro_oversize.jsonl (CWD-relative)."""
+    state_path = Path("data/state/oshwhub_pro_oversize.jsonl")
+    state_path.parent.mkdir(parents=True, exist_ok=True)
+    row = {
+        "uuid": uuid,
+        "out_root": str(out_root),
+        "bytes_so_far": err.bytes_so_far,
+        "cap_mb": err.cap_mb,
+        "ts": datetime.now(timezone.utc).isoformat(),
+    }
+    with state_path.open("a", encoding="utf-8") as f:  # ensure_ascii=False needs UTF-8
+        f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+
 def fetch_pro_source(
    pro_client: httpx.Client,
    project_uuid: str,
    proj_dir: Path,
    sleep: float = SLEEP_PRO,
+    max_source_mb: int | None = None,
 ) -> dict:
    """Dispatcher: pick modern (3.x branch+EPRO2) vs legacy (2.x v2/documents/lists)
    based on whether project meta contains a non-null branch_uuid.
@@ -440,11 +472,16 @@ def fetch_pro_source(
    Pro 3.x stores in git-style branch+history with AES-encrypted EPRO2 streams;
    Pro 2.x predates that and uses Std-style per-doc dataStr served from
    /api/v2/documents/lists. See docs/sources/easyeda_pro_source.md §1.1.
+
+    `max_source_mb` only gates modern-path projects (legacy is always tiny: <2 MB
+    in our 2/2 sample) and trips before any blob is written to disk past the cap.
    """
    proj = _pro_get_json(pro_client, f"{PRO_API}/projects/{project_uuid}", project_uuid)
    time.sleep(sleep)
    if proj.get("branch_uuid"):
-        return _fetch_pro_modern(pro_client, project_uuid, proj, proj_dir, sleep)
+        return _fetch_pro_modern(
+            pro_client, project_uuid, proj, proj_dir, sleep, max_source_mb=max_source_mb
+        )
    return _fetch_pro_legacy(pro_client, project_uuid, proj, proj_dir, sleep)


@@ -454,6 +491,7 @@ def _fetch_pro_modern(
    proj: dict,
    proj_dir: Path,
    sleep: float = SLEEP_PRO,
+    max_source_mb: int | None = None,
 ) -> dict:
    """Modern Pro 3.x fetcher: full history chain, AES-GCM decrypted, gunzipped,
    and partitioned into per-document EPRO2 streams.
@@ -513,11 +551,19 @@ def _fetch_pro_modern(
    cur_doc: str | None = None
    bytes_blob_total = 0
    bytes_plain_total = 0
+    cap_bytes = max_source_mb * 1024 * 1024 if max_source_mb is not None else None
    for h in ordered:
        blob_r = pro_client.get(h["dataStrUrl"], headers={"path": project_uuid})
        blob_r.raise_for_status()
        blob = blob_r.content
        bytes_blob_total += len(blob)
+        # Trip cap on the encrypted blob total. Hits *after* the offending
+        # download, but before we decrypt/gunzip/partition (those scale with
+        # plain bytes which is even larger). Wipe any partial source/ so disk
+        # doesn't accumulate junk on multi-project runs.
+        if cap_bytes is not None and bytes_blob_total > cap_bytes:
+            shutil.rmtree(src_dir, ignore_errors=True)
+            raise ProjectOversizeError(bytes_blob_total, max_source_mb)
        if len(blob) < 16:
            raise RuntimeError(f"history {h['uuid']} blob too short ({len(blob)} B)")
        ct, tag = blob[:-16], blob[-16:]
@@ -849,6 +895,8 @@ def crawl_one(
    fetch_files: bool = True,
    source_client: httpx.Client | None = None,
    pro_source_client: httpx.Client | None = None,
+    skip_exts: set[str] | None = None,
+    max_source_mb: int | None = None,
 ) -> CrawlResult:
    uuid = list_item["uuid"]
    path = list_item["path"]
@@ -916,6 +964,14 @@ def crawl_one(
            "size": a.get("size"),
            "md5": a.get("md5"),
        }
+        # ext gate: declared `ext` first, fall back to filename suffix. Lower-case
+        # compare; entry kept in metadata so we can re-fetch later if policy changes.
+        ext_token = (a.get("ext") or Path(safe_name).suffix.lstrip(".")).lower()
+        if skip_exts and ext_token in skip_exts:
+            entry["skipped"] = f"ext:{ext_token}"
+            skipped.append(f"{name}: ext:{ext_token}")
+            files_meta.append(entry)
+            continue
        if fetch_files:
            try:
                size, sha = download_to(client, file_url, local_path)
@@ -950,11 +1006,17 @@ def crawl_one(
    origin = list_item.get("origin")
    if origin == "pro" and pro_source_client is not None:
        try:
-            src_meta = fetch_pro_source(pro_source_client, uuid, proj_dir)
+            src_meta = fetch_pro_source(
+                pro_source_client, uuid, proj_dir, max_source_mb=max_source_mb
+            )
            print(
                f"  pro source: {len(src_meta.get('source_documents', []))} docs, "
                f"editor={src_meta.get('editor_version')}"
            )
+        except ProjectOversizeError as e:
+            print(f"  pro source SKIPPED (oversize): {e}", file=sys.stderr)
+            skipped.append(f"pro_source: oversize ({e.bytes_so_far // 1024 // 1024} MB > {e.cap_mb} MB)")
+            _record_oversize(out_root, uuid, e)
        except Exception as e:  # noqa: BLE001
            print(f"  pro source FAIL: {e}", file=sys.stderr)
            skipped.append(f"pro_source: {e}")
@@ -1088,7 +1150,27 @@ def main(argv: list[str] | None = None) -> int:
        default=PRO_COOKIE_PATH_DEFAULT,
        help="path to file with Cookie header for pro.lceda.cn",
    )
+    ap.add_argument(
+        "--skip-ext",
+        type=str,
+        default=None,
+        help="comma-separated list of attachment extensions to skip (e.g. mp4,qt,mov). "
+             "Saves ~30-50%% LFS storage on average. Entry still recorded in metadata.json "
+             "with skipped:ext:<token> so we can re-fetch later.",
+    )
+    ap.add_argument(
+        "--max-source-mb",
+        type=int,
+        default=None,
+        help="skip Pro modern projects whose chain blob total exceeds N MB. "
+             "Trips inside the chain loop, wipes partial source/, records to "
+             "data/state/oshwhub_pro_oversize.jsonl. No effect on Std or Pro 2.x legacy.",
+    )
    args = ap.parse_args(argv)
+    skip_exts: set[str] | None = (
+        {x.strip().lower().lstrip(".") for x in args.skip_ext.split(",") if x.strip()}
+        if args.skip_ext else None
+    )

    n_target = args.limit if args.limit is not None else args.top
    args.out.mkdir(parents=True, exist_ok=True)
@@ -1099,7 +1181,10 @@ def main(argv: list[str] | None = None) -> int:
        return _run_backfill_source(args.out, only_uuids=args.uuids)
    if args.backfill_pro_source:
        return _run_backfill_pro_source(
-            args.out, only_uuids=args.uuids, cookie_path=args.pro_cookie
+            args.out,
+            only_uuids=args.uuids,
+            cookie_path=args.pro_cookie,
+            max_source_mb=args.max_source_mb,
        )

    with make_client() as client:
@@ -1149,6 +1234,8 @@ def main(argv: list[str] | None = None) -> int:
                        fetch_files=not args.no_files,
                        source_client=source_client_ctx,
                        pro_source_client=pro_source_client_ctx,
+                        skip_exts=skip_exts,
+                        max_source_mb=args.max_source_mb,
                    )
                    print(
                        f"  OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
@@ -1217,6 +1304,7 @@ def _run_backfill_pro_source(
    out_root: Path,
    only_uuids: str | None = None,
    cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
+    max_source_mb: int | None = None,
 ) -> int:
    """Walk per-project dirs in out_root, fetch Pro source for origin=pro projects.

@@ -1253,7 +1341,13 @@ def _run_backfill_pro_source(
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
            print(f"[{i}/{len(targets)}] {uuid}  ({meta.get('title', '?')})")
            try:
-                src_meta = fetch_pro_source(pro_client, uuid, proj_dir)
+                src_meta = fetch_pro_source(
+                    pro_client, uuid, proj_dir, max_source_mb=max_source_mb
+                )
+            except ProjectOversizeError as e:
+                print(f"  SKIPPED (oversize): {e}", file=sys.stderr)
+                _record_oversize(out_root, uuid, e)
+                continue
            except Exception as e:  # noqa: BLE001
                print(f"  FAIL: {e}", file=sys.stderr)
                continue