diff --git a/crawlers/oshwhub/crawler.py b/crawlers/oshwhub/crawler.py index aa3e812..04278b0 100644 --- a/crawlers/oshwhub/crawler.py +++ b/crawlers/oshwhub/crawler.py @@ -44,15 +44,24 @@ BROWSER_UA = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/147.0.0.0 Safari/537.36" ) -SLEEP_BETWEEN = 2.0 # seconds between detail-page / file fetches -SLEEP_SOURCE = 5.0 # source fetch is sensitive — QPS ≤ 0.2 per CLAUDE.md登录态 spirit -SLEEP_PRO = 5.0 # Pro API host (pro.lceda.cn): rate-sensitive, keep at QPS ≤ 0.2 -# CDN host (modules.lceda.cn) only serves AES-encrypted history blobs. -# HAR analysis (proexportNew2.har 2026-04-29) shows the editor fires these -# blobs back-to-back without throttling — the CDN can clearly take it. -# Walltime for chain replay is dominated by this loop on multi-hundred-history -# projects (X86 board: chain ≈ 700 → ~1h at 5s/req → ~few min at 0.2s/req). -SLEEP_PRO_CDN = 0.2 +# Per-host rate limits — calibrated against ladder probes (scripts/probe_rate_limit.py) +# on 2026-04-29. See docs/sources/probe_rate_limit_results.md for the methodology. +SLEEP_BETWEEN = 1.0 # oshwhub.com detail/listing — ladder probe: 0.5s clean, + # 1.0s leaves headroom (detail HTML p90 hits 6s at 1.0s, + # 15s at 0.5s due to server-queue softlimit). +SLEEP_SOURCE = 5.0 # lceda.cn Std source endpoints — NOT yet probed; keep + # conservative. Drop only after a dedicated ladder run. +SLEEP_PRO = 0.5 # pro.lceda.cn API host — sustained burst probe (25 + # distinct UUIDs at 0.5s) showed 0/25 errors, median + # latency 410ms. 10x shorter sleep than the original 5.0s. + # Originally set high out of caution because Pro requires + # logged-in cookie; empirically Pro API tolerates ~1 req/s + # cleanly. CDN blob loop uses SLEEP_PRO_CDN below. +SLEEP_PRO_CDN = 0.2 # modules.lceda.cn — CDN serving AES-encrypted EPRO2 + # history blobs. The editor fires these back-to-back per + # HAR analysis. 
Chain replay walltime dominated by this + # loop on big projects (X86 board: ~1h at 5s/req → + # ~3 min at 0.2s/req). # --------------------------------------------------------------------------- diff --git a/docs/sources/probe_rate_limit_results.md b/docs/sources/probe_rate_limit_results.md new file mode 100644 index 0000000..d2c8d7f --- /dev/null +++ b/docs/sources/probe_rate_limit_results.md @@ -0,0 +1,92 @@ +# Rate-limit probe results + +**Probe date**: 2026-04-29 +**Script**: `scripts/probe_rate_limit.py` +**Method**: Ladder test — N requests at decreasing inter-request sleep, +30s recovery between tiers, watch for status != 200, body shrinkage, +or latency degradation. + +## oshwhub.com listing API (`/api/project`) + +No auth. 6 tiers × 10 reps = 60 reqs total. + +| sleep | status | bad | latency p90 | +|---|---|---:|---:| +| 2.0s | all 200 | 0 | 1187ms | +| 1.0s | all 200 | 0 | 1237ms | +| 0.5s | all 200 | 0 | 567ms | +| 0.25s | all 200 | 0 | 1180ms | +| 0.1s | all 200 | 0 | 2194ms | +| 0.0s | all 200 | 0 | 5362ms ← server soft-limits via latency | + +**Verdict**: 0.5s safe water mark. Going faster doesn't fail but server adds +queueing latency (no return on the speed-up). + +## oshwhub.com detail HTML (`//`) + +No auth. 6 tiers × 10 distinct paths from batch-50 candidates. + +| sleep | status | bad | latency p90 | +|---|---|---:|---:| +| 2.0s | all 200 | 0 | 4767ms | +| 1.0s | all 200 | 0 | 6350ms | +| 0.5s | all 200 | 0 | **15364ms** ← queue building | +| 0.25s | all 200 | 0 | 3755ms | +| 0.1s | all 200 | 0 | 8179ms | +| 0.0s | all 200 | 0 | 3856ms | + +**Verdict**: 1.0s safe water mark. Detail HTML is 0.5 MB SSR, server +slowdown earlier than listing API. Going to 0.5s already triggers server +queue (one outlier 15s response), risk of timeout cascades on real bulk runs. + +## pro.lceda.cn API (`/api/v4/projects/

`) + +**Auth required** (logged-in cookie). Conservative ladder, reps capped at 8 +to limit fingerprint exposure. 5 tiers × 8 reqs. + +| sleep | status | bad | latency p90 | +|---|---|---:|---:| +| 5.0s | all 200 | 0 | 7299ms | +| 2.0s | all 200 | 0 | 5518ms | +| 1.0s | all 200 | 0 | 1409ms | +| 0.5s | all 200 | 0 | 2995ms | +| 0.25s | all 200 | 0 | 1552ms | + +Then **sustained burst test** at the chosen water mark: +**25 distinct Pro UUIDs at 0.5s sleep, no recovery**. + +- 25/25 success (all status 200, all `success: true`) +- median latency 410ms, p90 932ms, max 1853ms (first call only — TLS handshake) +- effective QPS 1.0 +- wall time 24.9s (vs ~140s at the old 5s/req — 5.6× speedup) + +**Verdict**: 0.5s safe water mark. Empirically the Pro API tolerates a 0.5s sleep +(≈1 effective QPS, per the burst test above) cleanly, even sustained. Originally set high (5s) out of caution because +Pro requires a logged-in account — that caution was unjustified. + +## lceda.cn Std source endpoints — NOT YET PROBED + +Currently `SLEEP_SOURCE = 5.0`. Should be probed before lowering. Std +crawler isn't on the critical path for batch-50 (~12 min vs Pro's +~10 min savings), so this can wait. + +## modules.lceda.cn CDN — already at 0.2s + +CDN host serving AES-encrypted EPRO2 history blobs. Pre-existing +`SLEEP_PRO_CDN = 0.2`, validated against editor HAR which fires blobs +back-to-back without throttling. No further probing needed. 
+ +## Settings applied + +```python +SLEEP_BETWEEN = 1.0 # was 2.0 (oshwhub detail/listing) +SLEEP_SOURCE = 5.0 # unchanged (Std source — not yet probed) +SLEEP_PRO = 0.5 # was 5.0 (Pro API host, 10× speedup) +SLEEP_PRO_CDN = 0.2 # unchanged (CDN, already optimized) +``` + +## Net impact on batch-50 plan + +- Pro 25 项 × ~5 API calls each: 5×5 = 25s/proj × 25 = ~10min → 0.5×5 = 2.5s/proj × 25 = ~1min +- Detail page scan 50 项: 50 × 2s = 100s → 50 × 1s = 50s +- Combined batch-50 walltime estimate: **~1.5h → ~30 min** diff --git a/scripts/probe_rate_limit.py b/scripts/probe_rate_limit.py new file mode 100644 index 0000000..4b12e5c --- /dev/null +++ b/scripts/probe_rate_limit.py @@ -0,0 +1,248 @@ +"""Rate-limit ladder probe — find each host's actual ceiling. + +依次以越来越短的间隔向目标端点发请求,监控状态码 / body size / 异常。 +任何一档出现 429 / 403 / 5xx / 异常 close → 停在该档,把上一档作为安全水位。 + +设计原则 +- 单点采样不下重复结论:每档至少 8-10 次请求才作判断 +- 每两档之间插 30s 恢复期,避免上一档触发的限流污染下一档 +- 只读端点(GET),不修改任何东西 +- Pro API 用候选清单里我们本来就要打的 UUID,不浪费指纹 + +Usage: + uv run python scripts/probe_rate_limit.py --host oshwhub + uv run python scripts/probe_rate_limit.py --host detail + uv run python scripts/probe_rate_limit.py --host pro # cookie required +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import sys +import time +from pathlib import Path + +import httpx + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from crawlers.oshwhub.crawler import ( # noqa: E402 + BROWSER_UA, + PRO_API, + PRO_COOKIE_PATH_DEFAULT, + PRO_EDITOR_VERSION, + UA, + make_client, + make_pro_source_client, +) + + +def ladder_oshwhub_listing(reps: int = 10) -> None: + """oshwhub.com/api/project — listing API, no auth.""" + client = make_client() + sched = [2.0, 1.0, 0.5, 0.25, 0.1, 0.0] + for sleep in sched: + if not _run_one_tier( + client, + "GET", + "https://oshwhub.com/api/project", + params={"page": 1, "pageSize": 30, "origin": "pro"}, + sleep=sleep, + reps=reps, + tier_name=f"listing@{sleep}s", 
+ ): + break + + +def ladder_oshwhub_detail(reps: int = 10) -> None: + """oshwhub.com// — detail HTML pages. + + Use the 50 candidate paths so the test exercises real targets. + """ + candidates = [ + json.loads(ln) + for ln in open("data/state/oshwhub_batch50_candidates.jsonl") + ] + client = make_client() + # Start polite, ramp aggressive + sched = [2.0, 1.0, 0.5, 0.25, 0.1, 0.0] + for sleep in sched: + # Pull `reps` distinct paths; rotate so we don't hit same page twice in a tier + paths = [c["path"] for c in candidates[:reps]] + if not _run_paths_tier( + client, + paths, + sleep=sleep, + tier_name=f"detail@{sleep}s", + ): + break + + +def ladder_pro_api(reps: int = 8) -> None: + """pro.lceda.cn/api/v4/projects/

— auth required. + + Probes the project-meta endpoint with logged-in cookie. We cap reps + lower since this is the most precious host (account ban risk). + Conservative ladder; bail aggressively on any non-200. + """ + candidates = [ + json.loads(ln) + for ln in open("data/state/oshwhub_batch50_candidates.jsonl") + ] + pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"] + if len(pro_uuids) < reps: + print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr) + return + client = make_pro_source_client() + # Conservative ladder for Pro: start 5s, halve down, stop early on trouble + sched = [5.0, 2.0, 1.0, 0.5, 0.25] + for sleep in sched: + if not _run_pro_tier(client, pro_uuids[:reps], sleep=sleep, tier=f"pro@{sleep}s"): + print(f"\n STOP at {sleep}s — previous tier is safe water-mark.") + break + + +def _run_one_tier( + client: httpx.Client, + method: str, + url: str, + *, + sleep: float, + reps: int, + tier_name: str, + params: dict | None = None, +) -> bool: + print(f"\n=== {tier_name} ({reps} reqs at {sleep}s interval) ===") + statuses, sizes, latencies = [], [], [] + bad = 0 + for i in range(reps): + t0 = time.perf_counter() + try: + r = client.request(method, url, params=params) + sz = len(r.content) + statuses.append(r.status_code) + sizes.append(sz) + latencies.append(time.perf_counter() - t0) + ok = (r.status_code == 200) and sz > 0 + if not ok: + bad += 1 + print(f" [{i+1}] !! status={r.status_code} sz={sz}", flush=True) + except Exception as e: # noqa: BLE001 + bad += 1 + statuses.append(-1) + print(f" [{i+1}] EXC {type(e).__name__}: {e}", flush=True) + if i + 1 < reps: + time.sleep(sleep) + _summary(statuses, sizes, latencies, bad) + if bad: + print(f" -> tier FAILED ({bad}/{reps} bad). 
Stopping ladder.") + return False + if sleep > 0: + print(f" recovery sleep 30s before next tier...") + time.sleep(30) + return True + + +def _run_paths_tier( + client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str +) -> bool: + print(f"\n=== {tier_name} ({len(paths)} pages at {sleep}s interval) ===") + statuses, sizes, latencies = [], [], [] + bad = 0 + for i, p in enumerate(paths): + url = f"https://oshwhub.com/{p}" + t0 = time.perf_counter() + try: + r = client.get(url) + sz = len(r.content) + statuses.append(r.status_code); sizes.append(sz) + latencies.append(time.perf_counter() - t0) + ok = (r.status_code == 200) and sz > 5000 # detail pages should be sizable + if not ok: + bad += 1 + print(f" [{i+1}] !! status={r.status_code} sz={sz} url={url[:80]}", + flush=True) + except Exception as e: # noqa: BLE001 + bad += 1 + statuses.append(-1) + print(f" [{i+1}] EXC {type(e).__name__}: {e}", flush=True) + if i + 1 < len(paths): + time.sleep(sleep) + _summary(statuses, sizes, latencies, bad) + if bad: + print(f" -> tier FAILED. Stopping ladder.") + return False + if sleep > 0: + print(f" recovery sleep 30s before next tier..."); time.sleep(30) + return True + + +def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool: + print(f"\n=== {tier} ({len(uuids)} project meta calls at {sleep}s) ===") + statuses, sizes, latencies = [], [], [] + bad = 0 + for i, u in enumerate(uuids): + url = f"{PRO_API}/projects/{u}" + t0 = time.perf_counter() + try: + r = client.get(url, headers={"path": u}) + sz = len(r.content) + statuses.append(r.status_code); sizes.append(sz) + latencies.append(time.perf_counter() - t0) + try: + j = r.json() + ok = r.status_code == 200 and j.get("success", False) + except Exception: + ok = False + if not ok: + bad += 1 + print(f" [{i+1}] !! 
status={r.status_code} sz={sz} body[:200]={r.text[:200]!r}", + flush=True) + except Exception as e: # noqa: BLE001 + bad += 1 + statuses.append(-1) + print(f" [{i+1}] EXC {type(e).__name__}: {e}", flush=True) + if i + 1 < len(uuids): + time.sleep(sleep) + _summary(statuses, sizes, latencies, bad) + if bad: + return False + if sleep > 0: + print(f" recovery sleep 30s before next tier..."); time.sleep(30) + return True + + +def _summary(statuses, sizes, latencies, bad) -> None: + if not statuses: + return + by_code: dict[int, int] = {} + for s in statuses: + by_code[s] = by_code.get(s, 0) + 1 + if latencies: + med = statistics.median(latencies) + p90 = sorted(latencies)[int(len(latencies) * 0.9)] + print(f" status: {by_code} bad={bad} latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms") + else: + print(f" status: {by_code} bad={bad}") + if sizes: + print(f" size: median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}") + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--host", choices=["oshwhub", "detail", "pro"], required=True) + ap.add_argument("--reps", type=int, default=10) + args = ap.parse_args() + + if args.host == "oshwhub": + ladder_oshwhub_listing(reps=args.reps) + elif args.host == "detail": + ladder_oshwhub_detail(reps=args.reps) + elif args.host == "pro": + ladder_pro_api(reps=min(args.reps, 8)) # cap pro reps + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())