crawler: --skip-ext + --max-source-mb gates for batch-50 expansion
Two CLI gates needed before scaling Pro batch beyond top-5:
--skip-ext mp4,qt,mov (attachment filter)
Skips video extensions in attachment download. Phase 1 measurements
showed mp4+qt occupy ~54% of attachment storage. Entry still recorded
in metadata.json with skipped:ext:<token> so we can re-fetch later if
the policy changes. Matches on the server-declared `ext` when present,
falling back to the filename suffix otherwise; comparison is case-insensitive.
--max-source-mb N (Pro source size cap)
Trips inside the chain replay loop on encrypted-blob total. On trip:
raise ProjectOversizeError, wipe partial source/, append a row to
data/state/oshwhub_pro_oversize.jsonl. Lets us shortlist 50+ Pro
projects without one X86-board-class outlier (~500 MB) blowing the
LFS budget. Std and Pro 2.x legacy are not capped (both <2 MB in
sample).
Verified:
- cap=0 trips on first blob (1.2 MB), source/ wiped, state recorded
- cap=100 runs full ESP-VoCat (7.5 MB plain, 278 docs)
- skip-ext microtest: 8/8 cases (case-insensitive, declared/suffix
fallback, empty-token edge cases)
Plan + frozen candidate list for the next 50 projects:
- docs/plans/oshwhub_batch50.md
- data/state/oshwhub_batch50_candidates.jsonl (gitignore exception added)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ import hashlib
|
||||
import html as _html
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
@@ -428,11 +429,42 @@ def _order_history_chain(chain: list[dict]) -> list[dict]:
|
||||
return ordered
|
||||
|
||||
|
||||
class ProjectOversizeError(Exception):
    """Raised when a Pro project's chain replay would exceed the configured cap.

    `cap_mb` is the trip threshold in MB; `bytes_so_far` is the *encrypted blob*
    total accumulated when we tripped (pre-decompression, pre-partition). Both
    values are kept as attributes so callers can log or persist them.
    """

    def __init__(self, bytes_so_far: int, cap_mb: int) -> None:
        # Show a fractional MB figure plus the exact byte count: flooring to
        # whole MB can render a genuine trip as "100 MB > cap 100 MB".
        super().__init__(
            f"oversize: blob bytes {bytes_so_far / 1024 / 1024:.1f} MB "
            f"({bytes_so_far} B) > cap {cap_mb} MB"
        )
        self.bytes_so_far = bytes_so_far
        self.cap_mb = cap_mb

    def __reduce__(self) -> tuple:
        # The default exception reduce replays `self.args` (only the formatted
        # message) into __init__, which requires two arguments. Rebuild from
        # the real constructor arguments so copy/pickle round-trips succeed.
        return (type(self), (self.bytes_so_far, self.cap_mb))
|
||||
|
||||
|
||||
def _record_oversize(out_root: Path, uuid: str, err: ProjectOversizeError) -> None:
    """Append one row to data/state/oshwhub_pro_oversize.jsonl for later review.

    The state path is relative to the current working directory; `out_root` is
    only recorded in the row and does not influence where the file is written.
    Each row holds the project uuid, the byte total and cap taken from `err`,
    and a UTC ISO-8601 timestamp. The parent directory is created on demand.
    """
    state_path = Path("data/state/oshwhub_pro_oversize.jsonl")
    state_path.parent.mkdir(parents=True, exist_ok=True)
    row = {
        "uuid": uuid,
        "out_root": str(out_root),
        "bytes_so_far": err.bytes_so_far,
        "cap_mb": err.cap_mb,
        "ts": datetime.now(timezone.utc).isoformat(),
    }
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # instead of inheriting the platform default (which is not always UTF-8).
    with state_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
def fetch_pro_source(
|
||||
pro_client: httpx.Client,
|
||||
project_uuid: str,
|
||||
proj_dir: Path,
|
||||
sleep: float = SLEEP_PRO,
|
||||
max_source_mb: int | None = None,
|
||||
) -> dict:
|
||||
"""Dispatcher: pick modern (3.x branch+EPRO2) vs legacy (2.x v2/documents/lists)
|
||||
based on whether project meta contains a non-null branch_uuid.
|
||||
@@ -440,11 +472,16 @@ def fetch_pro_source(
|
||||
Pro 3.x stores in git-style branch+history with AES-encrypted EPRO2 streams;
|
||||
Pro 2.x predates that and uses Std-style per-doc dataStr served from
|
||||
/api/v2/documents/lists. See docs/sources/easyeda_pro_source.md §1.1.
|
||||
|
||||
`max_source_mb` only gates modern-path projects (legacy is always tiny: <2 MB
|
||||
in our 2/2 sample) and trips before any blob is written to disk past the cap.
|
||||
"""
|
||||
proj = _pro_get_json(pro_client, f"{PRO_API}/projects/{project_uuid}", project_uuid)
|
||||
time.sleep(sleep)
|
||||
if proj.get("branch_uuid"):
|
||||
return _fetch_pro_modern(pro_client, project_uuid, proj, proj_dir, sleep)
|
||||
return _fetch_pro_modern(
|
||||
pro_client, project_uuid, proj, proj_dir, sleep, max_source_mb=max_source_mb
|
||||
)
|
||||
return _fetch_pro_legacy(pro_client, project_uuid, proj, proj_dir, sleep)
|
||||
|
||||
|
||||
@@ -454,6 +491,7 @@ def _fetch_pro_modern(
|
||||
proj: dict,
|
||||
proj_dir: Path,
|
||||
sleep: float = SLEEP_PRO,
|
||||
max_source_mb: int | None = None,
|
||||
) -> dict:
|
||||
"""Modern Pro 3.x fetcher: full history chain, AES-GCM decrypted, gunzipped,
|
||||
and partitioned into per-document EPRO2 streams.
|
||||
@@ -513,11 +551,19 @@ def _fetch_pro_modern(
|
||||
cur_doc: str | None = None
|
||||
bytes_blob_total = 0
|
||||
bytes_plain_total = 0
|
||||
cap_bytes = max_source_mb * 1024 * 1024 if max_source_mb is not None else None
|
||||
for h in ordered:
|
||||
blob_r = pro_client.get(h["dataStrUrl"], headers={"path": project_uuid})
|
||||
blob_r.raise_for_status()
|
||||
blob = blob_r.content
|
||||
bytes_blob_total += len(blob)
|
||||
# Trip cap on the encrypted blob total. Hits *after* the offending
|
||||
# download, but before we decrypt/gunzip/partition (those scale with
|
||||
# plain bytes which is even larger). Wipe any partial source/ so disk
|
||||
# doesn't accumulate junk on multi-project runs.
|
||||
if cap_bytes is not None and bytes_blob_total > cap_bytes:
|
||||
shutil.rmtree(src_dir, ignore_errors=True)
|
||||
raise ProjectOversizeError(bytes_blob_total, max_source_mb)
|
||||
if len(blob) < 16:
|
||||
raise RuntimeError(f"history {h['uuid']} blob too short ({len(blob)} B)")
|
||||
ct, tag = blob[:-16], blob[-16:]
|
||||
@@ -849,6 +895,8 @@ def crawl_one(
|
||||
fetch_files: bool = True,
|
||||
source_client: httpx.Client | None = None,
|
||||
pro_source_client: httpx.Client | None = None,
|
||||
skip_exts: set[str] | None = None,
|
||||
max_source_mb: int | None = None,
|
||||
) -> CrawlResult:
|
||||
uuid = list_item["uuid"]
|
||||
path = list_item["path"]
|
||||
@@ -916,6 +964,14 @@ def crawl_one(
|
||||
"size": a.get("size"),
|
||||
"md5": a.get("md5"),
|
||||
}
|
||||
# ext gate: declared `ext` first, fall back to filename suffix. Lower-case
|
||||
# compare; entry kept in metadata so we can re-fetch later if policy changes.
|
||||
ext_token = (a.get("ext") or Path(safe_name).suffix.lstrip(".")).lower()
|
||||
if skip_exts and ext_token in skip_exts:
|
||||
entry["skipped"] = f"ext:{ext_token}"
|
||||
skipped.append(f"{name}: ext:{ext_token}")
|
||||
files_meta.append(entry)
|
||||
continue
|
||||
if fetch_files:
|
||||
try:
|
||||
size, sha = download_to(client, file_url, local_path)
|
||||
@@ -950,11 +1006,17 @@ def crawl_one(
|
||||
origin = list_item.get("origin")
|
||||
if origin == "pro" and pro_source_client is not None:
|
||||
try:
|
||||
src_meta = fetch_pro_source(pro_source_client, uuid, proj_dir)
|
||||
src_meta = fetch_pro_source(
|
||||
pro_source_client, uuid, proj_dir, max_source_mb=max_source_mb
|
||||
)
|
||||
print(
|
||||
f" pro source: {len(src_meta.get('source_documents', []))} docs, "
|
||||
f"editor={src_meta.get('editor_version')}"
|
||||
)
|
||||
except ProjectOversizeError as e:
|
||||
print(f" pro source SKIPPED (oversize): {e}", file=sys.stderr)
|
||||
skipped.append(f"pro_source: oversize ({e.bytes_so_far // 1024 // 1024} MB > {e.cap_mb} MB)")
|
||||
_record_oversize(out_root, uuid, e)
|
||||
except Exception as e: # noqa: BLE001
|
||||
print(f" pro source FAIL: {e}", file=sys.stderr)
|
||||
skipped.append(f"pro_source: {e}")
|
||||
@@ -1088,7 +1150,27 @@ def main(argv: list[str] | None = None) -> int:
|
||||
default=PRO_COOKIE_PATH_DEFAULT,
|
||||
help="path to file with Cookie header for pro.lceda.cn",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--skip-ext",
|
||||
type=str,
|
||||
default=None,
|
||||
help="comma-separated list of attachment extensions to skip (e.g. mp4,qt,mov). "
|
||||
"Saves ~30-50%% LFS storage on average. Entry still recorded in metadata.json "
|
||||
"with skipped:ext:<token> so we can re-fetch later.",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--max-source-mb",
|
||||
type=int,
|
||||
default=None,
|
||||
help="skip Pro modern projects whose chain blob total exceeds N MB. "
|
||||
"Trips inside the chain loop, wipes partial source/, records to "
|
||||
"data/state/oshwhub_pro_oversize.jsonl. No effect on Std or Pro 2.x legacy.",
|
||||
)
|
||||
args = ap.parse_args(argv)
|
||||
skip_exts: set[str] | None = (
|
||||
{x.strip().lower().lstrip(".") for x in args.skip_ext.split(",") if x.strip()}
|
||||
if args.skip_ext else None
|
||||
)
|
||||
|
||||
n_target = args.limit if args.limit is not None else args.top
|
||||
args.out.mkdir(parents=True, exist_ok=True)
|
||||
@@ -1099,7 +1181,10 @@ def main(argv: list[str] | None = None) -> int:
|
||||
return _run_backfill_source(args.out, only_uuids=args.uuids)
|
||||
if args.backfill_pro_source:
|
||||
return _run_backfill_pro_source(
|
||||
args.out, only_uuids=args.uuids, cookie_path=args.pro_cookie
|
||||
args.out,
|
||||
only_uuids=args.uuids,
|
||||
cookie_path=args.pro_cookie,
|
||||
max_source_mb=args.max_source_mb,
|
||||
)
|
||||
|
||||
with make_client() as client:
|
||||
@@ -1149,6 +1234,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
fetch_files=not args.no_files,
|
||||
source_client=source_client_ctx,
|
||||
pro_source_client=pro_source_client_ctx,
|
||||
skip_exts=skip_exts,
|
||||
max_source_mb=args.max_source_mb,
|
||||
)
|
||||
print(
|
||||
f" OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
|
||||
@@ -1217,6 +1304,7 @@ def _run_backfill_pro_source(
|
||||
out_root: Path,
|
||||
only_uuids: str | None = None,
|
||||
cookie_path: str = PRO_COOKIE_PATH_DEFAULT,
|
||||
max_source_mb: int | None = None,
|
||||
) -> int:
|
||||
"""Walk per-project dirs in out_root, fetch Pro source for origin=pro projects.
|
||||
|
||||
@@ -1253,7 +1341,13 @@ def _run_backfill_pro_source(
|
||||
meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
print(f"[{i}/{len(targets)}] {uuid} ({meta.get('title', '?')})")
|
||||
try:
|
||||
src_meta = fetch_pro_source(pro_client, uuid, proj_dir)
|
||||
src_meta = fetch_pro_source(
|
||||
pro_client, uuid, proj_dir, max_source_mb=max_source_mb
|
||||
)
|
||||
except ProjectOversizeError as e:
|
||||
print(f" SKIPPED (oversize): {e}", file=sys.stderr)
|
||||
_record_oversize(out_root, uuid, e)
|
||||
continue
|
||||
except Exception as e: # noqa: BLE001
|
||||
print(f" FAIL: {e}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user