crawler: --no-cover, --concurrency, drop cross-host sleep + batch-50 Step 1 done
Three crawler ergonomics for batch operations:
--no-cover Skip cover image download. For scan-only modes (license/meta
scrape) this drops ~1.3s/project and avoids slow-CDN hangs.
--concurrency N ThreadPoolExecutor wrapping the per-project loop. Default
1 = serial (current behavior). Anonymous endpoints tolerate
5+ comfortably; output uses a print lock for readable
interleaved progress. fetch_cover plumbs through crawl_one.
Drop cross-host sleep #1: in crawl_one between detail HTML (oshwhub.com)
and cover image (image.lceda.cn). Different hosts — sleep was unnecessary.
Saves ~1s/project. Sleep #2 (post-cover, before next iteration) stays — it
gates the next oshwhub.com hit.
download_to gains max_seconds wall budget (default 60s, cover uses 15s).
Defends against pathologically slow CDN connections — observed 10 KB/s
on image.lceda.cn for one project, would have hung 6+ min on a 3.6 MB
cover otherwise. httpx default timeout resets per chunk, so streaming
downloads need an external wall-clock guard.
batch-50 Step 1 (license/meta scrape) shipped:
50/50 candidates have metadata.json + license recorded
License distribution: GPL 3.0 32, Public Domain 6, NC variants 8,
CERN-OHL 1, MIT 1, CC BY 3.0 1
Forge-friendly (non-NC): 41/50 (82%)
Declared attachments: 180 files / 2.36 GB (median 18 MB/proj, max 304 MB)
Walltime: 3min 26s for 28 projects at concurrency=5 (server-side
HTML render bound, not sleep-bound)
One orphan partial cover (a670e60a...) cleaned up — leftover from the
first aborted run before the timeout fix landed.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ import json
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib.parse
|
||||
from datetime import datetime, timezone
|
||||
@@ -258,15 +259,30 @@ def parse_detail_html(h: str) -> dict:
|
||||
# Download helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def download_to(client: httpx.Client, url: str, dest: Path) -> tuple[int, str]:
|
||||
"""Stream-download url to dest. Returns (size, sha256)."""
|
||||
def download_to(
|
||||
client: httpx.Client, url: str, dest: Path, *, max_seconds: float = 60.0
|
||||
) -> tuple[int, str]:
|
||||
"""Stream-download url to dest. Returns (size, sha256).
|
||||
|
||||
`max_seconds` caps total download walltime — defends against
|
||||
pathologically slow CDN connections (observed 10 KB/s on
|
||||
image.lceda.cn for one project, would have hung 6+ min on a 3.6 MB
|
||||
cover otherwise). httpx's default timeout resets per chunk, so
|
||||
relying on it for streaming is unsafe. We track wall time ourselves.
|
||||
"""
|
||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||
h = hashlib.sha256()
|
||||
size = 0
|
||||
t_start = time.monotonic()
|
||||
with client.stream("GET", url) as r:
|
||||
r.raise_for_status()
|
||||
with open(dest, "wb") as f:
|
||||
for chunk in r.iter_bytes(1 << 15):
|
||||
if time.monotonic() - t_start > max_seconds:
|
||||
raise httpx.ReadTimeout(
|
||||
f"download exceeded {max_seconds}s wall budget "
|
||||
f"(got {size} of {r.headers.get('content-length', '?')} bytes)"
|
||||
)
|
||||
f.write(chunk)
|
||||
h.update(chunk)
|
||||
size += len(chunk)
|
||||
@@ -904,6 +920,7 @@ def crawl_one(
|
||||
list_item: dict,
|
||||
out_root: Path,
|
||||
fetch_files: bool = True,
|
||||
fetch_cover: bool = True,
|
||||
source_client: httpx.Client | None = None,
|
||||
pro_source_client: httpx.Client | None = None,
|
||||
skip_exts: set[str] | None = None,
|
||||
@@ -914,23 +931,28 @@ def crawl_one(
|
||||
proj_dir = out_root / uuid
|
||||
proj_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 1. Fetch detail HTML
|
||||
# 1. Fetch detail HTML.
|
||||
# No polite_sleep after — the next call goes to image.lceda.cn (cover)
|
||||
# or to attachment CDN, both different hosts from the detail server.
|
||||
# Sleep is used to space hits to the *same* host before the next iteration.
|
||||
detail_url = f"{BASE}/{path}"
|
||||
r = client.get(detail_url)
|
||||
r.raise_for_status()
|
||||
detail = parse_detail_html(r.text)
|
||||
polite_sleep()
|
||||
|
||||
# 2. Cover image
|
||||
# 2. Cover image (skipped via fetch_cover=False — useful for scan-only modes
|
||||
# like Step 1 of batch ingest where we only want license/meta).
|
||||
thumb_url = list_item["thumb"]
|
||||
if thumb_url.startswith("//"):
|
||||
thumb_url = "https:" + thumb_url
|
||||
cover_rel = None
|
||||
if thumb_url:
|
||||
if thumb_url and fetch_cover:
|
||||
ext = Path(urllib.parse.urlparse(thumb_url).path).suffix or ".jpg"
|
||||
cover_rel = f"cover{ext}"
|
||||
try:
|
||||
download_to(client, thumb_url, proj_dir / cover_rel)
|
||||
# Cover thumbs should be small (~100-300 KB). Cap walltime at 15s
|
||||
# so a pathologically slow CDN connection can't hang the loop.
|
||||
download_to(client, thumb_url, proj_dir / cover_rel, max_seconds=15.0)
|
||||
except httpx.HTTPError as e:
|
||||
print(f" cover failed: {e}", file=sys.stderr)
|
||||
cover_rel = None
|
||||
@@ -1177,6 +1199,30 @@ def main(argv: list[str] | None = None) -> int:
|
||||
"Trips inside the chain loop, wipes partial source/, records to "
|
||||
"data/state/oshwhub_pro_oversize.jsonl. No effect on Std or Pro 2.x legacy.",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--from-jsonl",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="read pre-selected listing items from a jsonl (one item per line, "
|
||||
"shape matches /api/project listing entries). Bypasses the listing "
|
||||
"API entirely — useful when candidates were chosen offline from the "
|
||||
"full-corpus index, where most aren't in the default top-N pages.",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--no-cover",
|
||||
action="store_true",
|
||||
help="skip cover image download. For scan-only runs (license / meta scrape) "
|
||||
"this drops ~1.3s/project and avoids slow-CDN hangs.",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--concurrency",
|
||||
type=int,
|
||||
default=1,
|
||||
help="number of parallel crawl_one workers (ThreadPoolExecutor). Default 1 "
|
||||
"= serial. Anonymous endpoints (oshwhub.com detail/listing) tolerate "
|
||||
"concurrency 5+ comfortably; only enable when no auth is involved or "
|
||||
"you've probed the host.",
|
||||
)
|
||||
args = ap.parse_args(argv)
|
||||
skip_exts: set[str] | None = (
|
||||
{x.strip().lower().lstrip(".") for x in args.skip_ext.split(",") if x.strip()}
|
||||
@@ -1200,7 +1246,13 @@ def main(argv: list[str] | None = None) -> int:
|
||||
|
||||
with make_client() as client:
|
||||
# Build list of items to crawl
|
||||
if args.uuids:
|
||||
if args.from_jsonl:
|
||||
items = [json.loads(ln) for ln in args.from_jsonl.read_text().splitlines() if ln.strip()]
|
||||
if args.uuids:
|
||||
wanted = set(args.uuids.split(","))
|
||||
items = [i for i in items if i.get("uuid") in wanted]
|
||||
print(f"loaded {len(items)} items from {args.from_jsonl}")
|
||||
elif args.uuids:
|
||||
wanted = set(args.uuids.split(","))
|
||||
items: list[dict] = []
|
||||
for it in iter_candidates(
|
||||
@@ -1234,26 +1286,41 @@ def main(argv: list[str] | None = None) -> int:
|
||||
make_pro_source_client(args.pro_cookie) if args.with_pro_source else None
|
||||
)
|
||||
try:
|
||||
print(f"Crawling {len(items)} projects -> {args.out}")
|
||||
for i, it in enumerate(items, 1):
|
||||
print(f"[{i}/{len(items)}] {it['path']} ({it['name']})")
|
||||
print(f"Crawling {len(items)} projects -> {args.out} (concurrency={args.concurrency})")
|
||||
print_lock = threading.Lock()
|
||||
|
||||
def _do_one(i: int, it: dict) -> None:
|
||||
try:
|
||||
r = crawl_one(
|
||||
client,
|
||||
it,
|
||||
args.out,
|
||||
fetch_files=not args.no_files,
|
||||
fetch_cover=not args.no_cover,
|
||||
source_client=source_client_ctx,
|
||||
pro_source_client=pro_source_client_ctx,
|
||||
skip_exts=skip_exts,
|
||||
max_source_mb=args.max_source_mb,
|
||||
)
|
||||
print(
|
||||
f" OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
|
||||
f"(skipped: {len(r.skipped_files)})"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f" FAIL: {e}", file=sys.stderr)
|
||||
with print_lock:
|
||||
print(
|
||||
f"[{i}/{len(items)}] OK {it['path']}: "
|
||||
f"{r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
|
||||
f"(skipped: {len(r.skipped_files)})"
|
||||
)
|
||||
except Exception as e: # noqa: BLE001
|
||||
with print_lock:
|
||||
print(f"[{i}/{len(items)}] FAIL {it['path']}: {e}", file=sys.stderr)
|
||||
|
||||
if args.concurrency <= 1:
|
||||
for i, it in enumerate(items, 1):
|
||||
_do_one(i, it)
|
||||
else:
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
with ThreadPoolExecutor(max_workers=args.concurrency) as pool:
|
||||
futs = [pool.submit(_do_one, i, it) for i, it in enumerate(items, 1)]
|
||||
for f in as_completed(futs):
|
||||
f.result()
|
||||
finally:
|
||||
if source_client_ctx is not None:
|
||||
source_client_ctx.close()
|
||||
|
||||
Reference in New Issue
Block a user