crawler: --no-cover, --concurrency, drop cross-host sleep + batch-50 Step 1 done

Three crawler ergonomics for batch operations:

--no-cover  Skip cover image download. For scan-only modes (license/meta
            scrape) this drops ~1.3s/project and avoids slow-CDN hangs.

--concurrency N  ThreadPoolExecutor wrapping the per-project loop. Default
                 1 = serial (current behavior). Anonymous endpoints tolerate
                 5+ comfortably; output uses a print lock for readable
                 interleaved progress. fetch_cover plumbs through crawl_one.

Drop cross-host sleep #1: in crawl_one between detail HTML (oshwhub.com)
and cover image (image.lceda.cn). Different hosts — sleep was unnecessary.
Saves ~1s/project. Sleep #2 (post-cover, before next iteration) stays — it
gates the next oshwhub.com hit.

download_to gains max_seconds wall budget (default 60s, cover uses 15s).
Defends against pathologically slow CDN connections — observed 10 KB/s
on image.lceda.cn for one project, would have hung 6+ min on a 3.6 MB
cover otherwise. httpx default timeout resets per chunk, so streaming
downloads need an external wall-clock guard.

batch-50 Step 1 (license/meta scrape) shipped:
  50/50 candidates have metadata.json + license recorded
  License distribution: GPL 3.0 32, Public Domain 6, NC variants 8,
                        CERN-OHL 1, MIT 1, CC BY 3.0 1
  Forge-friendly (non-NC): 41/50 (82%)
  Declared attachments: 180 files / 2.36 GB (median 18 MB/proj, max 304 MB)
  Walltime: 3min 26s for 28 projects at concurrency=5 (server-side
            HTML render bound, not sleep-bound)

One orphan partial cover (a670e60a...) cleaned up — leftover from the
first aborted run before the timeout fix landed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 01:35:11 +08:00
parent fe6971f3f9
commit c6fd111d6d
171 changed files with 5410 additions and 17 deletions

View File

@@ -21,6 +21,7 @@ import json
import re
import shutil
import sys
import threading
import time
import urllib.parse
from datetime import datetime, timezone
@@ -258,15 +259,30 @@ def parse_detail_html(h: str) -> dict:
# Download helpers
# ---------------------------------------------------------------------------
def download_to(client: httpx.Client, url: str, dest: Path) -> tuple[int, str]:
"""Stream-download url to dest. Returns (size, sha256)."""
def download_to(
client: httpx.Client, url: str, dest: Path, *, max_seconds: float = 60.0
) -> tuple[int, str]:
"""Stream-download url to dest. Returns (size, sha256).
`max_seconds` caps total download walltime — defends against
pathologically slow CDN connections (observed 10 KB/s on
image.lceda.cn for one project, would have hung 6+ min on a 3.6 MB
cover otherwise). httpx's default timeout resets per chunk, so
relying on it for streaming is unsafe. We track wall time ourselves.
"""
dest.parent.mkdir(parents=True, exist_ok=True)
h = hashlib.sha256()
size = 0
t_start = time.monotonic()
with client.stream("GET", url) as r:
r.raise_for_status()
with open(dest, "wb") as f:
for chunk in r.iter_bytes(1 << 15):
if time.monotonic() - t_start > max_seconds:
raise httpx.ReadTimeout(
f"download exceeded {max_seconds}s wall budget "
f"(got {size} of {r.headers.get('content-length', '?')} bytes)"
)
f.write(chunk)
h.update(chunk)
size += len(chunk)
@@ -904,6 +920,7 @@ def crawl_one(
list_item: dict,
out_root: Path,
fetch_files: bool = True,
fetch_cover: bool = True,
source_client: httpx.Client | None = None,
pro_source_client: httpx.Client | None = None,
skip_exts: set[str] | None = None,
@@ -914,23 +931,28 @@ def crawl_one(
proj_dir = out_root / uuid
proj_dir.mkdir(parents=True, exist_ok=True)
# 1. Fetch detail HTML
# 1. Fetch detail HTML.
# No polite_sleep after — the next call goes to image.lceda.cn (cover)
# or to attachment CDN, both different hosts from the detail server.
# Sleep is used to space hits to the *same* host before the next iteration.
detail_url = f"{BASE}/{path}"
r = client.get(detail_url)
r.raise_for_status()
detail = parse_detail_html(r.text)
polite_sleep()
# 2. Cover image
# 2. Cover image (skipped via fetch_cover=False — useful for scan-only modes
# like Step 1 of batch ingest where we only want license/meta).
thumb_url = list_item["thumb"]
if thumb_url.startswith("//"):
thumb_url = "https:" + thumb_url
cover_rel = None
if thumb_url:
if thumb_url and fetch_cover:
ext = Path(urllib.parse.urlparse(thumb_url).path).suffix or ".jpg"
cover_rel = f"cover{ext}"
try:
download_to(client, thumb_url, proj_dir / cover_rel)
# Cover thumbs should be small (~100-300 KB). Cap walltime at 15s
# so a pathologically slow CDN connection can't hang the loop.
download_to(client, thumb_url, proj_dir / cover_rel, max_seconds=15.0)
except httpx.HTTPError as e:
print(f" cover failed: {e}", file=sys.stderr)
cover_rel = None
@@ -1177,6 +1199,30 @@ def main(argv: list[str] | None = None) -> int:
"Trips inside the chain loop, wipes partial source/, records to "
"data/state/oshwhub_pro_oversize.jsonl. No effect on Std or Pro 2.x legacy.",
)
ap.add_argument(
"--from-jsonl",
type=Path,
default=None,
help="read pre-selected listing items from a jsonl (one item per line, "
"shape matches /api/project listing entries). Bypasses the listing "
"API entirely — useful when candidates were chosen offline from the "
"full-corpus index, where most aren't in the default top-N pages.",
)
ap.add_argument(
"--no-cover",
action="store_true",
help="skip cover image download. For scan-only runs (license / meta scrape) "
"this drops ~1.3s/project and avoids slow-CDN hangs.",
)
ap.add_argument(
"--concurrency",
type=int,
default=1,
help="number of parallel crawl_one workers (ThreadPoolExecutor). Default 1 "
"= serial. Anonymous endpoints (oshwhub.com detail/listing) tolerate "
"concurrency 5+ comfortably; only enable when no auth is involved or "
"you've probed the host.",
)
args = ap.parse_args(argv)
skip_exts: set[str] | None = (
{x.strip().lower().lstrip(".") for x in args.skip_ext.split(",") if x.strip()}
@@ -1200,7 +1246,13 @@ def main(argv: list[str] | None = None) -> int:
with make_client() as client:
# Build list of items to crawl
if args.uuids:
if args.from_jsonl:
items = [json.loads(ln) for ln in args.from_jsonl.read_text().splitlines() if ln.strip()]
if args.uuids:
wanted = set(args.uuids.split(","))
items = [i for i in items if i.get("uuid") in wanted]
print(f"loaded {len(items)} items from {args.from_jsonl}")
elif args.uuids:
wanted = set(args.uuids.split(","))
items: list[dict] = []
for it in iter_candidates(
@@ -1234,26 +1286,41 @@ def main(argv: list[str] | None = None) -> int:
make_pro_source_client(args.pro_cookie) if args.with_pro_source else None
)
try:
print(f"Crawling {len(items)} projects -> {args.out}")
for i, it in enumerate(items, 1):
print(f"[{i}/{len(items)}] {it['path']} ({it['name']})")
print(f"Crawling {len(items)} projects -> {args.out} (concurrency={args.concurrency})")
print_lock = threading.Lock()
def _do_one(i: int, it: dict) -> None:
try:
r = crawl_one(
client,
it,
args.out,
fetch_files=not args.no_files,
fetch_cover=not args.no_cover,
source_client=source_client_ctx,
pro_source_client=pro_source_client_ctx,
skip_exts=skip_exts,
max_source_mb=args.max_source_mb,
)
print(
f" OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
f"(skipped: {len(r.skipped_files)})"
)
except Exception as e:
print(f" FAIL: {e}", file=sys.stderr)
with print_lock:
print(
f"[{i}/{len(items)}] OK {it['path']}: "
f"{r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
f"(skipped: {len(r.skipped_files)})"
)
except Exception as e: # noqa: BLE001
with print_lock:
print(f"[{i}/{len(items)}] FAIL {it['path']}: {e}", file=sys.stderr)
if args.concurrency <= 1:
for i, it in enumerate(items, 1):
_do_one(i, it)
else:
from concurrent.futures import ThreadPoolExecutor, as_completed
with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
futs = [pool.submit(_do_one, i, it) for i, it in enumerate(items, 1)]
for f in as_completed(futs):
f.result()
finally:
if source_client_ctx is not None:
source_client_ctx.close()