crawler: --no-cover, --concurrency, drop cross-host sleep + batch-50 Step 1 done

Three crawler ergonomics for batch operations:

--no-cover
    Skip cover image download. For scan-only modes (license/meta scrape)
    this drops ~1.3s/project and avoids slow-CDN hangs.

--concurrency N
    ThreadPoolExecutor wrapping the per-project loop. Default 1 = serial
    (current behavior). Anonymous endpoints tolerate 5+ comfortably;
    output uses a print lock for readable interleaved progress.
    fetch_cover plumbs through crawl_one.

Drop cross-host sleep #1: in crawl_one between detail HTML (oshwhub.com)
and cover image (image.lceda.cn). Different hosts — sleep was unnecessary.
Saves ~1s/project. Sleep #2 (post-cover, before next iteration) stays —
it gates the next oshwhub.com hit.

download_to gains max_seconds wall budget (default 60s, cover uses 15s).
Defends against pathologically slow CDN connections — observed 10 KB/s on
image.lceda.cn for one project, would have hung 6+ min on a 3.6 MB cover
otherwise. httpx default timeout resets per chunk, so streaming downloads
need an external wall-clock guard.

batch-50 Step 1 (license/meta scrape) shipped:
    50/50 candidates have metadata.json + license recorded
    License distribution: GPL 3.0 32, Public Domain 6, NC variants 8, CERN-OHL 1, MIT 1, CC BY 3.0 1
    Forge-friendly (non-NC): 41/50 (82%)
    Declared attachments: 180 files / 2.36 GB (median 18 MB/proj, max 304 MB)
    Walltime: 3min 26s for 28 projects at concurrency=5 (server-side HTML render bound, not sleep-bound)

One orphan partial cover (a670e60a...) cleaned up — leftover from the
first aborted run before the timeout fix landed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 01:35:11 +08:00
parent fe6971f3f9
commit c6fd111d6d
171 changed files with 5410 additions and 17 deletions
--- a/crawlers/oshwhub/crawler.py
+++ b/crawlers/oshwhub/crawler.py
@@ -21,6 +21,7 @@ import json
 import re
 import shutil
 import sys
+import threading
 import time
 import urllib.parse
 from datetime import datetime, timezone
@@ -258,15 +259,30 @@ def parse_detail_html(h: str) -> dict:
 # Download helpers
 # ---------------------------------------------------------------------------

-def download_to(client: httpx.Client, url: str, dest: Path) -> tuple[int, str]:
-    """Stream-download url to dest. Returns (size, sha256)."""
+def download_to(
+    client: httpx.Client, url: str, dest: Path, *, max_seconds: float = 60.0
+) -> tuple[int, str]:
+    """Stream-download url to dest. Returns (size, sha256).
+
+    `max_seconds` caps total download walltime — defends against
+    pathologically slow CDN connections (observed 10 KB/s on
+    image.lceda.cn for one project, would have hung 6+ min on a 3.6 MB
+    cover otherwise). httpx's default timeout resets per chunk, so
+    relying on it for streaming is unsafe. We track wall time ourselves.
+    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    h = hashlib.sha256()
    size = 0
+    t_start = time.monotonic()
    with client.stream("GET", url) as r:
        r.raise_for_status()
        with open(dest, "wb") as f:
            for chunk in r.iter_bytes(1 << 15):
+                if time.monotonic() - t_start > max_seconds:
+                    raise httpx.ReadTimeout(
+                        f"download exceeded {max_seconds}s wall budget "
+                        f"(got {size} of {r.headers.get('content-length', '?')} bytes)"
+                    )
                f.write(chunk)
                h.update(chunk)
                size += len(chunk)
@@ -904,6 +920,7 @@ def crawl_one(
    list_item: dict,
    out_root: Path,
    fetch_files: bool = True,
+    fetch_cover: bool = True,
    source_client: httpx.Client | None = None,
    pro_source_client: httpx.Client | None = None,
    skip_exts: set[str] | None = None,
@@ -914,23 +931,28 @@ def crawl_one(
    proj_dir = out_root / uuid
    proj_dir.mkdir(parents=True, exist_ok=True)

-    # 1. Fetch detail HTML
+    # 1. Fetch detail HTML.
+    # No polite_sleep after — the next call goes to image.lceda.cn (cover)
+    # or to attachment CDN, both different hosts from the detail server.
+    # Sleep is used to space hits to the *same* host before the next iteration.
    detail_url = f"{BASE}/{path}"
    r = client.get(detail_url)
    r.raise_for_status()
    detail = parse_detail_html(r.text)
-    polite_sleep()

-    # 2. Cover image
+    # 2. Cover image (skipped via fetch_cover=False — useful for scan-only modes
+    # like Step 1 of batch ingest where we only want license/meta).
    thumb_url = list_item["thumb"]
    if thumb_url.startswith("//"):
        thumb_url = "https:" + thumb_url
    cover_rel = None
-    if thumb_url:
+    if thumb_url and fetch_cover:
        ext = Path(urllib.parse.urlparse(thumb_url).path).suffix or ".jpg"
        cover_rel = f"cover{ext}"
        try:
-            download_to(client, thumb_url, proj_dir / cover_rel)
+            # Cover thumbs should be small (~100-300 KB). Cap walltime at 15s
+            # so a pathologically slow CDN connection can't hang the loop.
+            download_to(client, thumb_url, proj_dir / cover_rel, max_seconds=15.0)
        except httpx.HTTPError as e:
            print(f"  cover failed: {e}", file=sys.stderr)
            cover_rel = None
@@ -1177,6 +1199,30 @@ def main(argv: list[str] | None = None) -> int:
             "Trips inside the chain loop, wipes partial source/, records to "
             "data/state/oshwhub_pro_oversize.jsonl. No effect on Std or Pro 2.x legacy.",
    )
+    ap.add_argument(
+        "--from-jsonl",
+        type=Path,
+        default=None,
+        help="read pre-selected listing items from a jsonl (one item per line, "
+             "shape matches /api/project listing entries). Bypasses the listing "
+             "API entirely — useful when candidates were chosen offline from the "
+             "full-corpus index, where most aren't in the default top-N pages.",
+    )
+    ap.add_argument(
+        "--no-cover",
+        action="store_true",
+        help="skip cover image download. For scan-only runs (license / meta scrape) "
+             "this drops ~1.3s/project and avoids slow-CDN hangs.",
+    )
+    ap.add_argument(
+        "--concurrency",
+        type=int,
+        default=1,
+        help="number of parallel crawl_one workers (ThreadPoolExecutor). Default 1 "
+             "= serial. Anonymous endpoints (oshwhub.com detail/listing) tolerate "
+             "concurrency 5+ comfortably; only enable when no auth is involved or "
+             "you've probed the host.",
+    )
    args = ap.parse_args(argv)
    skip_exts: set[str] | None = (
        {x.strip().lower().lstrip(".") for x in args.skip_ext.split(",") if x.strip()}
@@ -1200,7 +1246,13 @@ def main(argv: list[str] | None = None) -> int:

    with make_client() as client:
        # Build list of items to crawl
-        if args.uuids:
+        if args.from_jsonl:
+            items = [json.loads(ln) for ln in args.from_jsonl.read_text().splitlines() if ln.strip()]
+            if args.uuids:
+                wanted = set(args.uuids.split(","))
+                items = [i for i in items if i.get("uuid") in wanted]
+            print(f"loaded {len(items)} items from {args.from_jsonl}")
+        elif args.uuids:
            wanted = set(args.uuids.split(","))
            items: list[dict] = []
            for it in iter_candidates(
@@ -1234,26 +1286,41 @@ def main(argv: list[str] | None = None) -> int:
            make_pro_source_client(args.pro_cookie) if args.with_pro_source else None
        )
        try:
-            print(f"Crawling {len(items)} projects -> {args.out}")
-            for i, it in enumerate(items, 1):
-                print(f"[{i}/{len(items)}] {it['path']}  ({it['name']})")
+            print(f"Crawling {len(items)} projects -> {args.out}  (concurrency={args.concurrency})")
+            print_lock = threading.Lock()
+
+            def _do_one(i: int, it: dict) -> None:
                try:
                    r = crawl_one(
                        client,
                        it,
                        args.out,
                        fetch_files=not args.no_files,
+                        fetch_cover=not args.no_cover,
                        source_client=source_client_ctx,
                        pro_source_client=pro_source_client_ctx,
                        skip_exts=skip_exts,
                        max_source_mb=args.max_source_mb,
                    )
-                    print(
-                        f"  OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
-                        f"(skipped: {len(r.skipped_files)})"
-                    )
-                except Exception as e:
-                    print(f"  FAIL: {e}", file=sys.stderr)
+                    with print_lock:
+                        print(
+                            f"[{i}/{len(items)}] OK {it['path']}: "
+                            f"{r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
+                            f"(skipped: {len(r.skipped_files)})"
+                        )
+                except Exception as e:  # noqa: BLE001
+                    with print_lock:
+                        print(f"[{i}/{len(items)}] FAIL {it['path']}: {e}", file=sys.stderr)
+
+            if args.concurrency <= 1:
+                for i, it in enumerate(items, 1):
+                    _do_one(i, it)
+            else:
+                from concurrent.futures import ThreadPoolExecutor, as_completed
+                with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
+                    futs = [pool.submit(_do_one, i, it) for i, it in enumerate(items, 1)]
+                    for f in as_completed(futs):
+                        f.result()
        finally:
            if source_client_ctx is not None:
                source_client_ctx.close()