crawler: drop sleep rates 10x for Pro API, 2x for oshwhub detail

Calibrated against ladder probes on 2026-04-29. Findings in
docs/sources/probe_rate_limit_results.md.

  SLEEP_PRO     5.0 -> 0.5  (pro.lceda.cn API)
  SLEEP_BETWEEN 2.0 -> 1.0  (oshwhub detail/listing)
  SLEEP_SOURCE  5.0 unchanged (lceda.cn Std endpoints — not yet probed)
  SLEEP_PRO_CDN 0.2 unchanged (modules.lceda.cn — already optimized)

The original 5s rate for Pro API was set out of caution because Pro
requires a logged-in cookie. Empirical sustained-burst probe (25
distinct UUIDs at 0.5s sleep, no recovery): 0/25 errors, median
latency 410ms, p90 932ms. The "Pro is rate-sensitive" assumption was
wrong — server tolerates QPS=2 cleanly.

oshwhub detail HTML pages slowed from p90 6.4s at 1.0s sleep to
p90 15s at 0.5s — server queue backs up. 1.0s is the headroom-safe
water mark.

Net effect on batch-50 estimate: ~1.5h -> ~30min.

scripts/probe_rate_limit.py: rate-limit ladder probe tool. Reusable
for new endpoints (Std source still owes a probe). Designed for safety:
30s tier recovery, low rep counts on auth hosts, bail on first non-200.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 00:45:34 +08:00
parent 3c00edf6db
commit cb868988b9
3 changed files with 358 additions and 9 deletions

248
scripts/probe_rate_limit.py Normal file
View File

@@ -0,0 +1,248 @@
"""Rate-limit ladder probe — find each host's actual ceiling.
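Sends requests to the target endpoint at progressively shorter intervals,
monitoring status code / body size / exceptions. If any tier shows a
429 / 403 / 5xx / abnormal connection close, stop at that tier and treat the
previous tier as the safe water mark.

Design principles:
- Never draw a conclusion from a single sample: each tier needs at least
  8-10 requests before it is judged.
- Insert a 30s recovery period between tiers so that rate limiting triggered
  by one tier does not contaminate the next.
- Read-only endpoints (GET) only; nothing is modified.
- The Pro API probe uses UUIDs from the candidate list that we were going to
  fetch anyway, so no request fingerprint is wasted.
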
Usage:
uv run python scripts/probe_rate_limit.py --host oshwhub
uv run python scripts/probe_rate_limit.py --host detail
uv run python scripts/probe_rate_limit.py --host pro # cookie required
"""
from __future__ import annotations
import argparse
import json
import statistics
import sys
import time
from pathlib import Path
import httpx
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import ( # noqa: E402
BROWSER_UA,
PRO_API,
PRO_COOKIE_PATH_DEFAULT,
PRO_EDITOR_VERSION,
UA,
make_client,
make_pro_source_client,
)
def ladder_oshwhub_listing(reps: int = 10) -> None:
    """Probe the unauthenticated listing API at oshwhub.com/api/project.

    Walks a ladder of progressively shorter sleep intervals, sending ``reps``
    identical GET requests per tier through ``_run_one_tier``. The ladder
    stops at the first tier that reports a failure.

    Args:
        reps: Number of requests issued in each tier.
    """
    client = make_client()
    try:
        # Start polite and ramp toward back-to-back requests.
        for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
            tier_ok = _run_one_tier(
                client,
                "GET",
                "https://oshwhub.com/api/project",
                params={"page": 1, "pageSize": 30, "origin": "pro"},
                sleep=sleep,
                reps=reps,
                tier_name=f"listing@{sleep}s",
            )
            if not tier_ok:
                break
    finally:
        # Release the client's connections even if a tier raises or the
        # run is interrupted (e.g. Ctrl-C during a recovery sleep).
        client.close()
def ladder_oshwhub_detail(reps: int = 10) -> None:
    """Probe oshwhub.com/<owner>/<path> detail HTML pages.

    Reads candidate records from
    ``data/state/oshwhub_batch50_candidates.jsonl`` (one JSON object per
    line, each providing a ``"path"`` key) so the probe exercises real
    targets. The first ``reps`` paths are fetched once per tier through
    ``_run_paths_tier``; the ladder stops at the first failing tier.

    Args:
        reps: Maximum number of distinct detail pages fetched in each tier.
    """
    candidates_file = "data/state/oshwhub_batch50_candidates.jsonl"
    # Explicit UTF-8 so decoding does not depend on the platform locale;
    # blank lines (e.g. a trailing newline) are skipped instead of crashing
    # json.loads.
    with open(candidates_file, encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    # The same `reps` distinct paths are reused for every tier, so a page is
    # never requested twice within one tier.
    paths = [c["path"] for c in candidates[:reps]]
    if not paths:
        # Without this guard every tier would "pass" with zero requests.
        print(f"no candidate paths available in {candidates_file}", file=sys.stderr)
        return

    client = make_client()
    try:
        # Start polite, ramp aggressive.
        for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
            tier_ok = _run_paths_tier(
                client,
                paths,
                sleep=sleep,
                tier_name=f"detail@{sleep}s",
            )
            if not tier_ok:
                break
    finally:
        # Release the client's connections even if a tier raises.
        client.close()
def ladder_pro_api(reps: int = 8) -> None:
    """Probe pro.lceda.cn/api/v4/projects/<P> — auth required.

    Probes the project-meta endpoint with the logged-in cookie. Reps are
    capped lower since this is the most precious host (account ban risk).
    Conservative ladder; bail aggressively on any non-200.

    Candidate records are read from
    ``data/state/oshwhub_batch50_candidates.jsonl``; only records whose
    ``"origin"`` is ``"pro"`` are used, and their ``"uuid"`` values are the
    probe targets. If fewer than ``reps`` such UUIDs exist, a message is
    written to stderr and nothing is probed.

    Args:
        reps: Number of distinct project UUIDs queried in each tier.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale;
    # blank lines are skipped instead of crashing json.loads.
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"]
    if len(pro_uuids) < reps:
        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
        return
    targets = pro_uuids[:reps]

    client = make_pro_source_client()
    try:
        # Conservative ladder for Pro: start at 5s, step down, stop early on trouble.
        for sleep in (5.0, 2.0, 1.0, 0.5, 0.25):
            if not _run_pro_tier(client, targets, sleep=sleep, tier=f"pro@{sleep}s"):
                print(f"\n STOP at {sleep}s — previous tier is safe water-mark.")
                break
    finally:
        # Release the client's connections even if a tier raises.
        client.close()
def _run_one_tier(
    client: httpx.Client,
    method: str,
    url: str,
    *,
    sleep: float,
    reps: int,
    tier_name: str,
    params: dict | None = None,
) -> bool:
    """Send ``reps`` identical requests to ``url`` and judge the tier.

    A response counts as bad unless its status is 200 and its body is
    non-empty; any exception raised while requesting also counts as bad and
    is recorded with status ``-1``. ``sleep`` seconds are waited between
    consecutive requests (not after the last one). A summary is printed via
    ``_summary`` once the tier is done.

    Returns:
        ``False`` if at least one request was bad. Otherwise ``True``, after
        a 30s recovery pause whenever ``sleep`` is greater than zero.
    """
    print(f"\n=== {tier_name} ({reps} reqs at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for attempt in range(1, reps + 1):
        started = time.perf_counter()
        try:
            resp = client.request(method, url, params=params)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 0:
                failures += 1
                print(f" [{attempt}] !! status={resp.status_code} sz={nbytes}", flush=True)
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # Pace the requests, but do not wait after the final one.
        if attempt < reps:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        print(f" -> tier FAILED ({failures}/{reps} bad). Stopping ladder.")
        return False
    if sleep > 0:
        # Let the server settle so this tier cannot skew the next one.
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _run_paths_tier(
    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
) -> bool:
    """Fetch each oshwhub detail page in ``paths`` once and judge the tier.

    Every path is requested as ``https://oshwhub.com/<path>``. A response is
    bad unless its status is 200 and its body is larger than 5000 bytes; an
    exception is also bad and is recorded with status ``-1``. ``sleep``
    seconds are waited between consecutive pages (not after the last one).
    A summary is printed via ``_summary`` once the tier is done.

    Returns:
        ``False`` if any page was bad. Otherwise ``True``, after a 30s
        recovery pause whenever ``sleep`` is greater than zero.
    """
    total = len(paths)
    print(f"\n=== {tier_name} ({total} pages at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for pos, path in enumerate(paths, start=1):
        url = f"https://oshwhub.com/{path}"
        started = time.perf_counter()
        try:
            resp = client.get(url)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            # Detail pages should be sizable; a tiny body is treated as a failure.
            if resp.status_code != 200 or nbytes <= 5000:
                failures += 1
                print(
                    f" [{pos}] !! status={resp.status_code} sz={nbytes} url={url[:80]}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f" [{pos}] EXC {type(exc).__name__}: {exc}", flush=True)
        # Pace the requests, but do not wait after the final one.
        if pos < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        print(" -> tier FAILED. Stopping ladder.")
        return False
    if sleep > 0:
        # Let the server settle so this tier cannot skew the next one.
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Query the Pro project-meta endpoint for each UUID and judge the tier.

    Each UUID is requested as ``GET {PRO_API}/projects/<uuid>`` with a
    ``path`` header set to the UUID. A response is good only when its status
    is 200 and its body parses as a JSON object with a truthy ``"success"``
    value. An exception is bad and is recorded with status ``-1``.

    The tier is abandoned on the FIRST bad result instead of sending the
    remaining requests: this host needs a logged-in cookie, so continuing to
    hit it after a refusal only adds risk. ``sleep`` seconds are waited
    between consecutive calls. A summary is printed via ``_summary``.

    Args:
        client: HTTP client used to send the requests.
        uuids: Project UUIDs to query, one call each.
        sleep: Seconds to wait between consecutive calls.
        tier: Label printed in the tier header.

    Returns:
        ``False`` if a call was bad. Otherwise ``True``, after a 30s recovery
        pause whenever ``sleep`` is greater than zero.
    """
    print(f"\n=== {tier} ({len(uuids)} project meta calls at {sleep}s) ===")
    statuses, sizes, latencies = [], [], []
    bad = 0
    for i, u in enumerate(uuids):
        url = f"{PRO_API}/projects/{u}"
        t0 = time.perf_counter()
        try:
            r = client.get(url, headers={"path": u})
            sz = len(r.content)
            statuses.append(r.status_code)
            sizes.append(sz)
            latencies.append(time.perf_counter() - t0)
            try:
                payload = r.json()
            except ValueError:
                # Body is not valid JSON (e.g. an HTML error page).
                payload = None
            ok = (
                r.status_code == 200
                and isinstance(payload, dict)
                and bool(payload.get("success", False))
            )
            if not ok:
                bad += 1
                print(f" [{i+1}] !! status={r.status_code} sz={sz} body[:200]={r.text[:200]!r}",
                      flush=True)
        except Exception as e:  # noqa: BLE001
            bad += 1
            statuses.append(-1)
            print(f" [{i+1}] EXC {type(e).__name__}: {e}", flush=True)
        if bad:
            # Bail immediately; do not keep probing a host that just refused us.
            print(f" bailing out of tier after {i + 1}/{len(uuids)} calls", flush=True)
            break
        if i + 1 < len(uuids):
            time.sleep(sleep)
    _summary(statuses, sizes, latencies, bad)
    if bad:
        return False
    if sleep > 0:
        # Let the server settle so this tier cannot skew the next one.
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _summary(statuses: list[int], sizes: list[int], latencies: list[float], bad: int) -> None:
    """Print a per-tier summary of status codes, latency and body size.

    Nothing is printed when ``statuses`` is empty. The latency figures are
    omitted when ``latencies`` is empty, and the size line is omitted when
    ``sizes`` is empty.

    Args:
        statuses: One entry per request (callers record ``-1`` for an exception).
        sizes: Body sizes of the requests that produced a response.
        latencies: Elapsed seconds of the requests that produced a response.
        bad: Number of requests the caller judged as failed.
    """
    if not statuses:
        return
    # Count occurrences per status code, keeping first-seen order.
    by_code: dict[int, int] = {}
    for s in statuses:
        by_code[s] = by_code.get(s, 0) + 1
    if latencies:
        med = statistics.median(latencies)
        ordered = sorted(latencies)
        # Nearest-rank p90: the sample at rank ceil(0.9 * n). Integer
        # arithmetic avoids float rounding, and unlike int(n * 0.9) it does
        # not report the maximum as p90 when n is a multiple of 10.
        p90 = ordered[(9 * len(ordered) + 9) // 10 - 1]
        print(f" status: {by_code} bad={bad} latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms")
    else:
        print(f" status: {by_code} bad={bad}")
    if sizes:
        print(f" size: median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")
def main() -> int:
    """Parse the command line and run the ladder for the selected host.

    ``--host`` (required) selects one of ``oshwhub``, ``detail`` or ``pro``;
    ``--reps`` (default 10) sets the requests per tier. For the ``pro`` host
    the rep count is capped at 8.

    Returns:
        Always ``0``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", choices=["oshwhub", "detail", "pro"], required=True)
    parser.add_argument("--reps", type=int, default=10)
    opts = parser.parse_args()

    # One entry per accepted --host value.
    runners = {
        "oshwhub": lambda: ladder_oshwhub_listing(reps=opts.reps),
        "detail": lambda: ladder_oshwhub_detail(reps=opts.reps),
        "pro": lambda: ladder_pro_api(reps=min(opts.reps, 8)),  # cap pro reps
    }
    runner = runners.get(opts.host)
    if runner is not None:
        runner()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())