crawler: drop sleep rates 10x for Pro API, 2x for oshwhub detail

Calibrated against ladder probes on 2026-04-29. Findings are in docs/sources/probe_rate_limit_results.md.

- SLEEP_PRO 5.0 -> 0.5 (pro.lceda.cn API)
- SLEEP_BETWEEN 2.0 -> 1.0 (oshwhub detail/listing)
- SLEEP_SOURCE 5.0 unchanged (lceda.cn Std endpoints — not yet probed)
- SLEEP_PRO_CDN 0.2 unchanged (modules.lceda.cn — already optimized)

The original 5s rate for the Pro API was set out of caution because Pro requires a logged-in cookie. Empirical sustained-burst probe (25 distinct UUIDs at 0.5s sleep, no recovery): 0/25 errors, median latency 410ms, p90 932ms. The "Pro is rate-sensitive" assumption was wrong — the server tolerates QPS=2 cleanly.

oshwhub detail HTML pages slowed from p90 6.4s at 1.0s sleep to p90 15s at 0.5s — the server queue backs up. 1.0s is the headroom-safe water mark.

Net effect on the batch-50 estimate: ~1.5h -> ~30min.

scripts/probe_rate_limit.py: rate-limit ladder probe tool. Reusable for new endpoints (the Std source still owes a probe). Designed for safety: 30s tier recovery, low rep counts on auth hosts, bail on first non-200.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 00:45:34 +08:00
parent 3c00edf6db
commit cb868988b9
3 changed files with 358 additions and 9 deletions
--- a/scripts/probe_rate_limit.py
+++ b/scripts/probe_rate_limit.py
@@ -0,0 +1,248 @@
+"""Rate-limit ladder probe — find each host's actual ceiling.
+
+依次以越来越短的间隔向目标端点发请求，监控状态码 / body size / 异常。
+任何一档出现 429 / 403 / 5xx / 异常 close → 停在该档，把上一档作为安全水位。
+
+设计原则
+- 单点采样不下重复结论：每档至少 8-10 次请求才作判断
+- 每两档之间插 30s 恢复期，避免上一档触发的限流污染下一档
+- 只读端点（GET），不修改任何东西
+- Pro API 用候选清单里我们本来就要打的 UUID，不浪费指纹
+
+Usage:
+    uv run python scripts/probe_rate_limit.py --host oshwhub
+    uv run python scripts/probe_rate_limit.py --host detail
+    uv run python scripts/probe_rate_limit.py --host pro     # cookie required
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import sys
+import time
+from pathlib import Path
+
+import httpx
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from crawlers.oshwhub.crawler import (  # noqa: E402
+    BROWSER_UA,
+    PRO_API,
+    PRO_COOKIE_PATH_DEFAULT,
+    PRO_EDITOR_VERSION,
+    UA,
+    make_client,
+    make_pro_source_client,
+)
+
+
+def ladder_oshwhub_listing(reps: int = 10) -> None:
+    """oshwhub.com/api/project — listing API, no auth."""
+    client = make_client()
+    sched = [2.0, 1.0, 0.5, 0.25, 0.1, 0.0]
+    for sleep in sched:
+        if not _run_one_tier(
+            client,
+            "GET",
+            "https://oshwhub.com/api/project",
+            params={"page": 1, "pageSize": 30, "origin": "pro"},
+            sleep=sleep,
+            reps=reps,
+            tier_name=f"listing@{sleep}s",
+        ):
+            break
+
+
+def ladder_oshwhub_detail(reps: int = 10) -> None:
+    """oshwhub.com/<owner>/<path> — detail HTML pages.
+
+    Use the 50 candidate paths so the test exercises real targets.
+    """
+    candidates = [
+        json.loads(ln)
+        for ln in open("data/state/oshwhub_batch50_candidates.jsonl")
+    ]
+    client = make_client()
+    # Start polite, ramp aggressive
+    sched = [2.0, 1.0, 0.5, 0.25, 0.1, 0.0]
+    for sleep in sched:
+        # Pull `reps` distinct paths; rotate so we don't hit same page twice in a tier
+        paths = [c["path"] for c in candidates[:reps]]
+        if not _run_paths_tier(
+            client,
+            paths,
+            sleep=sleep,
+            tier_name=f"detail@{sleep}s",
+        ):
+            break
+
+
+def ladder_pro_api(reps: int = 8) -> None:
+    """pro.lceda.cn/api/v4/projects/<P> — auth required.
+
+    Probes the project-meta endpoint with logged-in cookie. We cap reps
+    lower since this is the most precious host (account ban risk).
+    Conservative ladder; bail aggressively on any non-200.
+    """
+    candidates = [
+        json.loads(ln)
+        for ln in open("data/state/oshwhub_batch50_candidates.jsonl")
+    ]
+    pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"]
+    if len(pro_uuids) < reps:
+        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
+        return
+    client = make_pro_source_client()
+    # Conservative ladder for Pro: start 5s, halve down, stop early on trouble
+    sched = [5.0, 2.0, 1.0, 0.5, 0.25]
+    for sleep in sched:
+        if not _run_pro_tier(client, pro_uuids[:reps], sleep=sleep, tier=f"pro@{sleep}s"):
+            print(f"\n  STOP at {sleep}s — previous tier is safe water-mark.")
+            break
+
+
+def _run_one_tier(
+    client: httpx.Client,
+    method: str,
+    url: str,
+    *,
+    sleep: float,
+    reps: int,
+    tier_name: str,
+    params: dict | None = None,
+) -> bool:
+    print(f"\n=== {tier_name}  ({reps} reqs at {sleep}s interval) ===")
+    statuses, sizes, latencies = [], [], []
+    bad = 0
+    for i in range(reps):
+        t0 = time.perf_counter()
+        try:
+            r = client.request(method, url, params=params)
+            sz = len(r.content)
+            statuses.append(r.status_code)
+            sizes.append(sz)
+            latencies.append(time.perf_counter() - t0)
+            ok = (r.status_code == 200) and sz > 0
+            if not ok:
+                bad += 1
+                print(f"  [{i+1}] !! status={r.status_code} sz={sz}", flush=True)
+        except Exception as e:  # noqa: BLE001
+            bad += 1
+            statuses.append(-1)
+            print(f"  [{i+1}] EXC {type(e).__name__}: {e}", flush=True)
+        if i + 1 < reps:
+            time.sleep(sleep)
+    _summary(statuses, sizes, latencies, bad)
+    if bad:
+        print(f"  -> tier FAILED ({bad}/{reps} bad). Stopping ladder.")
+        return False
+    if sleep > 0:
+        print(f"  recovery sleep 30s before next tier...")
+        time.sleep(30)
+    return True
+
+
+def _run_paths_tier(
+    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
+) -> bool:
+    print(f"\n=== {tier_name}  ({len(paths)} pages at {sleep}s interval) ===")
+    statuses, sizes, latencies = [], [], []
+    bad = 0
+    for i, p in enumerate(paths):
+        url = f"https://oshwhub.com/{p}"
+        t0 = time.perf_counter()
+        try:
+            r = client.get(url)
+            sz = len(r.content)
+            statuses.append(r.status_code); sizes.append(sz)
+            latencies.append(time.perf_counter() - t0)
+            ok = (r.status_code == 200) and sz > 5000  # detail pages should be sizable
+            if not ok:
+                bad += 1
+                print(f"  [{i+1}] !! status={r.status_code} sz={sz} url={url[:80]}",
+                      flush=True)
+        except Exception as e:  # noqa: BLE001
+            bad += 1
+            statuses.append(-1)
+            print(f"  [{i+1}] EXC {type(e).__name__}: {e}", flush=True)
+        if i + 1 < len(paths):
+            time.sleep(sleep)
+    _summary(statuses, sizes, latencies, bad)
+    if bad:
+        print(f"  -> tier FAILED. Stopping ladder.")
+        return False
+    if sleep > 0:
+        print(f"  recovery sleep 30s before next tier..."); time.sleep(30)
+    return True
+
+
+def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
+    print(f"\n=== {tier}  ({len(uuids)} project meta calls at {sleep}s) ===")
+    statuses, sizes, latencies = [], [], []
+    bad = 0
+    for i, u in enumerate(uuids):
+        url = f"{PRO_API}/projects/{u}"
+        t0 = time.perf_counter()
+        try:
+            r = client.get(url, headers={"path": u})
+            sz = len(r.content)
+            statuses.append(r.status_code); sizes.append(sz)
+            latencies.append(time.perf_counter() - t0)
+            try:
+                j = r.json()
+                ok = r.status_code == 200 and j.get("success", False)
+            except Exception:
+                ok = False
+            if not ok:
+                bad += 1
+                print(f"  [{i+1}] !! status={r.status_code} sz={sz} body[:200]={r.text[:200]!r}",
+                      flush=True)
+        except Exception as e:  # noqa: BLE001
+            bad += 1
+            statuses.append(-1)
+            print(f"  [{i+1}] EXC {type(e).__name__}: {e}", flush=True)
+        if i + 1 < len(uuids):
+            time.sleep(sleep)
+    _summary(statuses, sizes, latencies, bad)
+    if bad:
+        return False
+    if sleep > 0:
+        print(f"  recovery sleep 30s before next tier..."); time.sleep(30)
+    return True
+
+
+def _summary(statuses, sizes, latencies, bad) -> None:
+    if not statuses:
+        return
+    by_code: dict[int, int] = {}
+    for s in statuses:
+        by_code[s] = by_code.get(s, 0) + 1
+    if latencies:
+        med = statistics.median(latencies)
+        p90 = sorted(latencies)[int(len(latencies) * 0.9)]
+        print(f"  status: {by_code}  bad={bad}  latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms")
+    else:
+        print(f"  status: {by_code}  bad={bad}")
+    if sizes:
+        print(f"  size:   median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--host", choices=["oshwhub", "detail", "pro"], required=True)
+    ap.add_argument("--reps", type=int, default=10)
+    args = ap.parse_args()
+
+    if args.host == "oshwhub":
+        ladder_oshwhub_listing(reps=args.reps)
+    elif args.host == "detail":
+        ladder_oshwhub_detail(reps=args.reps)
+    elif args.host == "pro":
+        ladder_pro_api(reps=min(args.reps, 8))  # cap pro reps
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())