"""Rate-limit ladder probe — find each host's actual ceiling.

Sends requests to the target endpoint at progressively shorter intervals
while monitoring status code / body size / exceptions. As soon as a tier
shows a bad response (non-200, failed body check) or an exception, the
ladder stops at that tier; the previous tier is the safe water-mark.

Design principles
- Never conclude from a single sample: by default each tier makes 8-10
  requests before it is judged.
- A 30s recovery period is inserted between tiers so that throttling
  triggered by one tier does not contaminate the next.
- Read-only endpoints (GET) only; nothing is modified.
- The Pro API probe uses UUIDs from the candidate list that we were going
  to fetch anyway, so no fingerprint is wasted.

Usage:
    uv run python scripts/probe_rate_limit.py --host oshwhub
    uv run python scripts/probe_rate_limit.py --host detail
    uv run python scripts/probe_rate_limit.py --host std-doc
    uv run python scripts/probe_rate_limit.py --host pro  # cookie required
"""

from __future__ import annotations

import argparse
import json
import statistics
import sys
import time
from collections.abc import Callable, Iterable, Sequence
from pathlib import Path
from typing import Any

import httpx

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from crawlers.oshwhub.crawler import (  # noqa: E402
    BROWSER_UA,
    LCEDA_DOC_API,
    PRO_API,
    PRO_COOKIE_PATH_DEFAULT,
    PRO_EDITOR_VERSION,
    UA,
    make_client,
    make_pro_source_client,
    make_source_client,
)

# JSONL file with one candidate project per line (keys used here: "path",
# "uuid", "origin").
CANDIDATES_PATH = "data/state/oshwhub_batch50_candidates.jsonl"
# JSON list of Std document UUIDs.
STD_DOC_UUIDS_PATH = "/tmp/std_doc_uuids.json"
# Pause between two tiers so throttling from one tier can wear off.
RECOVERY_SLEEP_S = 30
# Upper bound on requests per tier against the authenticated Pro API.
PRO_MAX_REPS = 8


def _load_candidates(path: str = CANDIDATES_PATH) -> list[dict]:
    """Read the candidate JSONL file and return one dict per non-blank line."""
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def _tier_slot(pool: Sequence[Any], tier_index: int, reps: int) -> Sequence[Any]:
    """Return the `reps` items reserved for tier number `tier_index`.

    Each tier gets its own consecutive window of the pool so that tiers do
    not re-request the same targets. Once the pool cannot supply a full
    window, fall back to the first `reps` items (i.e. accept repeats).
    """
    start = tier_index * reps
    window = pool[start : start + reps]
    return window if len(window) == reps else pool[:reps]


def ladder_oshwhub_listing(reps: int = 10) -> None:
    """oshwhub.com/api/project — listing API, no auth.

    Runs tiers at 2.0s down to 0.0s between requests and stops at the first
    tier that reports a failure.
    """
    client = make_client()
    try:
        for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
            if not _run_one_tier(
                client,
                "GET",
                "https://oshwhub.com/api/project",
                params={"page": 1, "pageSize": 30, "origin": "pro"},
                sleep=sleep,
                reps=reps,
                tier_name=f"listing@{sleep}s",
            ):
                break
    finally:
        client.close()


def ladder_oshwhub_detail(reps: int = 10) -> None:
    """oshwhub.com/<path> — detail HTML pages.

    Uses the paths from the candidate file so the test exercises real
    targets.
    """
    paths = [c["path"] for c in _load_candidates() if c.get("path")]
    if not paths:
        print("no candidate paths available", file=sys.stderr)
        return
    client = make_client()
    try:
        # Start polite, ramp aggressive.
        for tier_index, sleep in enumerate((2.0, 1.0, 0.5, 0.25, 0.1, 0.0)):
            # Rotate through the pool so consecutive tiers hit different pages.
            slot = list(_tier_slot(paths, tier_index, reps))
            if not _run_paths_tier(
                client,
                slot,
                sleep=sleep,
                tier_name=f"detail@{sleep}s",
            ):
                break
    finally:
        client.close()


def ladder_std_doc(reps: int = 10) -> None:
    """lceda.cn/api/documents/<uuid> — anonymous Std doc fetch.

    Reads /tmp/std_doc_uuids.json (collected from already-crawled Std
    projects). Std endpoints want browser UA + Referer (see
    docs/sources/easyeda_std_source.md §3) — use the real source client.
    """
    uuids = json.loads(Path(STD_DOC_UUIDS_PATH).read_text(encoding="utf-8"))
    if len(uuids) < reps:
        print(f"only {len(uuids)} Std doc UUIDs available", file=sys.stderr)
        return
    client = make_source_client()
    try:
        for tier_index, sleep in enumerate((5.0, 2.0, 1.0, 0.5, 0.25)):
            # Rotate so each tier hits distinct doc UUIDs; repeats only once
            # the pool is exhausted.
            slot = list(_tier_slot(uuids, tier_index, reps))
            if not _run_std_tier(client, slot, sleep=sleep, tier=f"std-doc@{sleep}s"):
                break
    finally:
        client.close()


def ladder_pro_api(reps: int = 8) -> None:
    """pro.lceda.cn/api/v4/projects/<uuid> — auth required.

    Probes the project-meta endpoint with logged-in cookie. We cap reps
    lower since this is the most precious host (account ban risk).
    Conservative ladder; bail aggressively on any non-200.
    """
    pro_uuids = [c["uuid"] for c in _load_candidates() if c.get("origin") == "pro"]
    if len(pro_uuids) < reps:
        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
        return
    client = make_pro_source_client()
    try:
        # Conservative ladder for Pro: start 5s, halve down, stop early on trouble.
        for sleep in (5.0, 2.0, 1.0, 0.5, 0.25):
            if not _run_pro_tier(client, pro_uuids[:reps], sleep=sleep, tier=f"pro@{sleep}s"):
                print(f"\n STOP at {sleep}s — previous tier is safe water-mark.")
                break
    finally:
        client.close()


def _probe_tier(
    header: str,
    targets: Iterable[Any],
    fetch: Callable[[Any], httpx.Response],
    *,
    sleep: float,
    is_ok: Callable[[httpx.Response], bool],
    detail: Callable[[httpx.Response, Any], str] | None = None,
    stop_on_first_bad: bool = False,
) -> int:
    """Run one tier of requests and return the number of bad results.

    Args:
        header: Line printed before the tier starts.
        targets: One item per request; each is handed to `fetch`.
        fetch: Performs the request for a target and returns the response.
        sleep: Seconds to wait between two consecutive requests.
        is_ok: Decides whether a response counts as healthy.
        detail: Optional extra text appended to the line printed for a bad
            response.
        stop_on_first_bad: Abort the tier right after the first bad result
            instead of sending the remaining requests.

    Returns:
        Count of bad responses plus raised exceptions. The per-tier summary
        is printed before returning.
    """
    print(header)
    items = list(targets)
    statuses: list[int] = []
    sizes: list[int] = []
    latencies: list[float] = []
    bad = 0
    for idx, target in enumerate(items, start=1):
        started = time.perf_counter()
        try:
            resp = fetch(target)
            size = len(resp.content)
            elapsed = time.perf_counter() - started
        except Exception as exc:  # noqa: BLE001 - any transport error is a signal
            bad += 1
            statuses.append(-1)  # -1 marks "no HTTP status, request raised"
            print(f" [{idx}] EXC {type(exc).__name__}: {exc}", flush=True)
        else:
            statuses.append(resp.status_code)
            sizes.append(size)
            latencies.append(elapsed)
            if not is_ok(resp):
                bad += 1
                suffix = detail(resp, target) if detail is not None else ""
                print(f" [{idx}] !! status={resp.status_code} sz={size}{suffix}", flush=True)
        if bad and stop_on_first_bad:
            break
        if idx < len(items):
            time.sleep(sleep)
    _summary(statuses, sizes, latencies, bad)
    return bad


def _conclude_tier(bad: int, sleep: float, failure_msg: str | None = None) -> bool:
    """Turn a tier's bad count into the continue/stop decision.

    Returns False (after printing `failure_msg`, if given) when the tier had
    any bad result. Otherwise returns True, first sleeping for the recovery
    period unless the tier ran with a zero interval.
    """
    if bad:
        if failure_msg is not None:
            print(failure_msg)
        return False
    if sleep > 0:
        print(f" recovery sleep {RECOVERY_SLEEP_S}s before next tier...")
        time.sleep(RECOVERY_SLEEP_S)
    return True


def _json_success(resp: httpx.Response) -> bool:
    """Return True for a 200 response whose JSON body has a truthy "success"."""
    if resp.status_code != 200:
        return False
    try:
        return bool(resp.json().get("success", False))
    except Exception:  # noqa: BLE001 - an unparsable body simply means "not ok"
        return False


def _body_snippet(resp: httpx.Response, _target: Any) -> str:
    """Format the first 200 characters of the body for a bad-response line."""
    return f" body[:200]={resp.text[:200]!r}"


def _run_one_tier(
    client: httpx.Client,
    method: str,
    url: str,
    *,
    sleep: float,
    reps: int,
    tier_name: str,
    params: dict | None = None,
) -> bool:
    """Send `reps` identical requests to `url`, `sleep` seconds apart.

    A response is healthy when it is a 200 with a non-empty body. Returns
    True when the whole tier was healthy, False otherwise.
    """
    bad = _probe_tier(
        f"\n=== {tier_name} ({reps} reqs at {sleep}s interval) ===",
        range(reps),
        lambda _n: client.request(method, url, params=params),
        sleep=sleep,
        is_ok=lambda resp: resp.status_code == 200 and len(resp.content) > 0,
    )
    return _conclude_tier(
        bad, sleep, f" -> tier FAILED ({bad}/{reps} bad). Stopping ladder."
    )


def _run_paths_tier(
    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
) -> bool:
    """GET each oshwhub.com page in `paths`, `sleep` seconds apart.

    A response is healthy when it is a 200 larger than 5000 bytes. Returns
    True when the whole tier was healthy, False otherwise.
    """

    def page_url(path: str) -> str:
        return f"https://oshwhub.com/{path}"

    bad = _probe_tier(
        f"\n=== {tier_name} ({len(paths)} pages at {sleep}s interval) ===",
        paths,
        lambda path: client.get(page_url(path)),
        sleep=sleep,
        # Detail pages should be sizable; a tiny 200 is treated as a failure.
        is_ok=lambda resp: resp.status_code == 200 and len(resp.content) > 5000,
        detail=lambda _resp, path: f" url={page_url(path)[:80]}",
    )
    return _conclude_tier(bad, sleep, " -> tier FAILED. Stopping ladder.")


def _run_std_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Fetch each Std document in `uuids`, `sleep` seconds apart.

    A response is healthy when it is a 200 whose JSON body reports success.
    Returns True when the whole tier was healthy, False otherwise.
    """
    bad = _probe_tier(
        f"\n=== {tier} ({len(uuids)} doc fetches at {sleep}s) ===",
        uuids,
        lambda u: client.get(f"{LCEDA_DOC_API}/{u}", params={"uuid": u, "path": u}),
        sleep=sleep,
        is_ok=_json_success,
        detail=_body_snippet,
    )
    return _conclude_tier(bad, sleep)


def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Fetch project metadata for each Pro UUID, `sleep` seconds apart.

    A response is healthy when it is a 200 whose JSON body reports success.
    The tier is aborted at the first bad result: this endpoint is
    authenticated, so continuing to send requests after a rejection only
    adds risk to the account. Returns True when the whole tier was healthy,
    False otherwise.
    """
    bad = _probe_tier(
        f"\n=== {tier} ({len(uuids)} project meta calls at {sleep}s) ===",
        uuids,
        lambda u: client.get(f"{PRO_API}/projects/{u}", headers={"path": u}),
        sleep=sleep,
        is_ok=_json_success,
        detail=_body_snippet,
        stop_on_first_bad=True,
    )
    return _conclude_tier(bad, sleep)


def _summary(statuses, sizes, latencies, bad) -> None:
    """Print status-code counts, latency median/p90 and body-size stats.

    Prints nothing when no request was recorded. Latency and size lines are
    only produced for the samples that actually returned a response.
    """
    if not statuses:
        return
    by_code: dict[int, int] = {}
    for code in statuses:
        by_code[code] = by_code.get(code, 0) + 1
    if latencies:
        ordered = sorted(latencies)
        med = statistics.median(ordered)
        # Nearest-rank percentile: ceil(0.9 * n) - 1, in integer arithmetic.
        # Truncating 0.9 * n instead would select the maximum for n = 8 or 10.
        p90 = ordered[(9 * len(ordered) + 9) // 10 - 1]
        print(
            f" status: {by_code} bad={bad} "
            f"latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms"
        )
    else:
        print(f" status: {by_code} bad={bad}")
    if sizes:
        print(f" size: median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")


def main() -> int:
    """Parse the command line and run the ladder for the selected host.

    Returns:
        Process exit code (always 0; argument errors exit through argparse).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--host", choices=["oshwhub", "detail", "pro", "std-doc"], required=True)
    ap.add_argument("--reps", type=int, default=10)
    args = ap.parse_args()
    if args.reps < 1:
        ap.error("--reps must be at least 1")

    if args.host == "oshwhub":
        ladder_oshwhub_listing(reps=args.reps)
    elif args.host == "detail":
        ladder_oshwhub_detail(reps=args.reps)
    elif args.host == "pro":
        ladder_pro_api(reps=min(args.reps, PRO_MAX_REPS))  # cap pro reps
    elif args.host == "std-doc":
        ladder_std_doc(reps=args.reps)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())