crawler: --skip-ext + --max-source-mb gates for batch-50 expansion

Two CLI gates needed before scaling Pro batch beyond top-5:

--skip-ext mp4,qt,mov  (attachment filter)
  Skips video extensions in attachment download. Phase 1 measurements
  showed mp4+qt occupy ~54% of attachment storage. Entry still recorded
  in metadata.json with skipped:ext:<token> so we can re-fetch later if
  the policy changes. Honors both server-declared `ext` and filename
  suffix, case-insensitively.

--max-source-mb N  (Pro source size cap)
  Trips inside the chain replay loop once the running encrypted-blob total
  exceeds N MB (checked after each blob download, before decrypt). On trip:
  raise ProjectOversizeError, wipe partial source/, append a row to
  data/state/oshwhub_pro_oversize.jsonl. Lets us shortlist 50+ Pro
  projects without one X86-board-class outlier (~500 MB) blowing the
  LFS budget. Std and Pro 2.x legacy are not capped (both <2 MB in
  sample).

Verified:
  - cap=0 trips on first blob (1.2 MB), source/ wiped, state recorded
  - cap=100 runs full ESP-VoCat (7.5 MB plain, 278 docs)
  - skip-ext microtest: 8/8 cases (case-insensitive, declared/suffix
    fallback, empty-token edge cases)

Plan + frozen candidate list for the next 50 projects:
  - docs/plans/oshwhub_batch50.md
  - data/state/oshwhub_batch50_candidates.jsonl (gitignore exception added)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 00:24:55 +08:00
parent e61404478e
commit eee1a9b97e
5 changed files with 347 additions and 5 deletions

View File

@@ -19,6 +19,7 @@ import hashlib
import html as _html
import json
import re
import shutil
import sys
import time
import urllib.parse
@@ -428,11 +429,42 @@ def _order_history_chain(chain: list[dict]) -> list[dict]:
return ordered
class ProjectOversizeError(Exception):
    """Raised when a Pro project's chain replay would exceed the configured cap.

    `cap_mb` is the trip threshold; `bytes_so_far` is the *encrypted blob* total
    accumulated when we tripped (pre-decompression, pre-partition).

    The message reports the size with two decimals plus the exact byte count.
    Whole-MB floor division would render e.g. 100.5 MB against a 100 MB cap as
    the self-contradictory "100 MB > cap 100 MB".

    Attributes:
        bytes_so_far: encrypted blob bytes accumulated at the moment of the trip.
        cap_mb: the configured cap, in MB, that was exceeded.
    """

    def __init__(self, bytes_so_far: int, cap_mb: int) -> None:
        size_mb = bytes_so_far / (1024 * 1024)
        super().__init__(
            f"oversize: blob bytes {size_mb:.2f} MB ({bytes_so_far} B) > cap {cap_mb} MB"
        )
        self.bytes_so_far = bytes_so_far
        self.cap_mb = cap_mb
def _record_oversize(out_root: Path, uuid: str, err: ProjectOversizeError) -> None:
    """Append one row to data/state/oshwhub_pro_oversize.jsonl for later review.

    The state file path is relative to the current working directory; its
    parent directory is created on demand. Each row is a single JSON object
    with the keys `uuid`, `out_root`, `bytes_so_far`, `cap_mb` and `ts`
    (timezone-aware UTC timestamp in ISO-8601 form).

    Args:
        out_root: crawl output root, recorded as a string in the row.
        uuid: project uuid that tripped the cap.
        err: the oversize error carrying `bytes_so_far` and `cap_mb`.
    """
    state_path = Path("data/state/oshwhub_pro_oversize.jsonl")
    state_path.parent.mkdir(parents=True, exist_ok=True)
    row = {
        "uuid": uuid,
        "out_root": str(out_root),
        "bytes_so_far": err.bytes_so_far,
        "cap_mb": err.cap_mb,
        "ts": datetime.now(timezone.utc).isoformat(),
    }
    # ensure_ascii=False may emit non-ASCII characters (e.g. in out_root), so
    # pin UTF-8 rather than relying on the platform's locale default encoding.
    with state_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
def fetch_pro_source(
pro_client: httpx.Client,
project_uuid: str,
proj_dir: Path,
sleep: float = SLEEP_PRO,
max_source_mb: int | None = None,
) -> dict:
"""Dispatcher: pick modern (3.x branch+EPRO2) vs legacy (2.x v2/documents/lists)
based on whether project meta contains a non-null branch_uuid.
@@ -440,11 +472,16 @@ def fetch_pro_source(
Pro 3.x stores in git-style branch+history with AES-encrypted EPRO2 streams;
Pro 2.x predates that and uses Std-style per-doc dataStr served from
/api/v2/documents/lists. See docs/sources/easyeda_pro_source.md §1.1.
`max_source_mb` only gates modern-path projects (legacy is always tiny: <2 MB
in our 2/2 sample) and trips before any blob is written to disk past the cap.
"""
proj = _pro_get_json(pro_client, f"{PRO_API}/projects/{project_uuid}", project_uuid)
time.sleep(sleep)
if proj.get("branch_uuid"):
return _fetch_pro_modern(pro_client, project_uuid, proj, proj_dir, sleep)
return _fetch_pro_modern(
pro_client, project_uuid, proj, proj_dir, sleep, max_source_mb=max_source_mb
)
return _fetch_pro_legacy(pro_client, project_uuid, proj, proj_dir, sleep)
@@ -454,6 +491,7 @@ def _fetch_pro_modern(
proj: dict,
proj_dir: Path,
sleep: float = SLEEP_PRO,
max_source_mb: int | None = None,
) -> dict:
"""Modern Pro 3.x fetcher: full history chain, AES-GCM decrypted, gunzipped,
and partitioned into per-document EPRO2 streams.
@@ -513,11 +551,19 @@ def _fetch_pro_modern(
cur_doc: str | None = None
bytes_blob_total = 0
bytes_plain_total = 0
cap_bytes = max_source_mb * 1024 * 1024 if max_source_mb is not None else None
for h in ordered:
blob_r = pro_client.get(h["dataStrUrl"], headers={"path": project_uuid})
blob_r.raise_for_status()
blob = blob_r.content
bytes_blob_total += len(blob)
# Trip cap on the encrypted blob total. Hits *after* the offending
# download, but before we decrypt/gunzip/partition (those scale with
# plain bytes which is even larger). Wipe any partial source/ so disk
# doesn't accumulate junk on multi-project runs.
if cap_bytes is not None and bytes_blob_total > cap_bytes:
shutil.rmtree(src_dir, ignore_errors=True)
raise ProjectOversizeError(bytes_blob_total, max_source_mb)
if len(blob) < 16:
raise RuntimeError(f"history {h['uuid']} blob too short ({len(blob)} B)")
ct, tag = blob[:-16], blob[-16:]
@@ -849,6 +895,8 @@ def crawl_one(
fetch_files: bool = True,
source_client: httpx.Client | None = None,
pro_source_client: httpx.Client | None = None,
skip_exts: set[str] | None = None,
max_source_mb: int | None = None,
) -> CrawlResult:
uuid = list_item["uuid"]
path = list_item["path"]
@@ -916,6 +964,14 @@ def crawl_one(
"size": a.get("size"),
"md5": a.get("md5"),
}
# ext gate: declared `ext` first, fall back to filename suffix. Lower-case
# compare; entry kept in metadata so we can re-fetch later if policy changes.
ext_token = (a.get("ext") or Path(safe_name).suffix.lstrip(".")).lower()
if skip_exts and ext_token in skip_exts:
entry["skipped"] = f"ext:{ext_token}"
skipped.append(f"{name}: ext:{ext_token}")
files_meta.append(entry)
continue
if fetch_files:
try:
size, sha = download_to(client, file_url, local_path)
@@ -950,11 +1006,17 @@ def crawl_one(
origin = list_item.get("origin")
if origin == "pro" and pro_source_client is not None:
try:
src_meta = fetch_pro_source(pro_source_client, uuid, proj_dir)
src_meta = fetch_pro_source(
pro_source_client, uuid, proj_dir, max_source_mb=max_source_mb
)
print(
f" pro source: {len(src_meta.get('source_documents', []))} docs, "
f"editor={src_meta.get('editor_version')}"
)
except ProjectOversizeError as e:
print(f" pro source SKIPPED (oversize): {e}", file=sys.stderr)
skipped.append(f"pro_source: oversize ({e.bytes_so_far // 1024 // 1024} MB > {e.cap_mb} MB)")
_record_oversize(out_root, uuid, e)
except Exception as e: # noqa: BLE001
print(f" pro source FAIL: {e}", file=sys.stderr)
skipped.append(f"pro_source: {e}")
@@ -1088,7 +1150,27 @@ def main(argv: list[str] | None = None) -> int:
default=PRO_COOKIE_PATH_DEFAULT,
help="path to file with Cookie header for pro.lceda.cn",
)
ap.add_argument(
"--skip-ext",
type=str,
default=None,
help="comma-separated list of attachment extensions to skip (e.g. mp4,qt,mov). "
"Saves ~30-50%% LFS storage on average. Entry still recorded in metadata.json "
"with skipped:ext:<token> so we can re-fetch later.",
)
ap.add_argument(
"--max-source-mb",
type=int,
default=None,
help="skip Pro modern projects whose chain blob total exceeds N MB. "
"Trips inside the chain loop, wipes partial source/, records to "
"data/state/oshwhub_pro_oversize.jsonl. No effect on Std or Pro 2.x legacy.",
)
args = ap.parse_args(argv)
skip_exts: set[str] | None = (
{x.strip().lower().lstrip(".") for x in args.skip_ext.split(",") if x.strip()}
if args.skip_ext else None
)
n_target = args.limit if args.limit is not None else args.top
args.out.mkdir(parents=True, exist_ok=True)
@@ -1099,7 +1181,10 @@ def main(argv: list[str] | None = None) -> int:
return _run_backfill_source(args.out, only_uuids=args.uuids)
if args.backfill_pro_source:
return _run_backfill_pro_source(
args.out, only_uuids=args.uuids, cookie_path=args.pro_cookie
args.out,
only_uuids=args.uuids,
cookie_path=args.pro_cookie,
max_source_mb=args.max_source_mb,
)
with make_client() as client:
@@ -1149,6 +1234,8 @@ def main(argv: list[str] | None = None) -> int:
fetch_files=not args.no_files,
source_client=source_client_ctx,
pro_source_client=pro_source_client_ctx,
skip_exts=skip_exts,
max_source_mb=args.max_source_mb,
)
print(
f" OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
@@ -1217,6 +1304,7 @@ def _run_backfill_pro_source(
out_root: Path,
only_uuids: str | None = None,
cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
max_source_mb: int | None = None,
) -> int:
"""Walk per-project dirs in out_root, fetch Pro source for origin=pro projects.
@@ -1253,7 +1341,13 @@ def _run_backfill_pro_source(
meta = json.loads(meta_path.read_text(encoding="utf-8"))
print(f"[{i}/{len(targets)}] {uuid} ({meta.get('title', '?')})")
try:
src_meta = fetch_pro_source(pro_client, uuid, proj_dir)
src_meta = fetch_pro_source(
pro_client, uuid, proj_dir, max_source_mb=max_source_mb
)
except ProjectOversizeError as e:
print(f" SKIPPED (oversize): {e}", file=sys.stderr)
_record_oversize(out_root, uuid, e)
continue
except Exception as e: # noqa: BLE001
print(f" FAIL: {e}", file=sys.stderr)
continue