crawler: drop sleep rates 10x for Pro API, 2x for oshwhub detail
Calibrated against ladder probes on 2026-04-29. Findings in docs/sources/probe_rate_limit_results.md.

- SLEEP_PRO 5.0 -> 0.5 (pro.lceda.cn API)
- SLEEP_BETWEEN 2.0 -> 1.0 (oshwhub detail/listing)
- SLEEP_SOURCE 5.0 unchanged (lceda.cn Std endpoints — not yet probed)
- SLEEP_PRO_CDN 0.2 unchanged (modules.lceda.cn — already optimized)

The original 5s rate for Pro API was set out of caution because Pro requires a logged-in cookie. Empirical sustained-burst probe (25 distinct UUIDs at 0.5s sleep, no recovery): 0/25 errors, median latency 410ms, p90 932ms. The "Pro is rate-sensitive" assumption was wrong — server tolerates QPS=2 cleanly.

oshwhub detail HTML pages slowed from p90 6.4s at 1.0s sleep to p90 15s at 0.5s — server queue backs up. 1.0s is the headroom-safe water mark.

Net effect on batch-50 estimate: ~1.5h -> ~30min.

scripts/probe_rate_limit.py: rate-limit ladder probe tool. Reusable for new endpoints (Std source still owes a probe). Designed for safety: 30s tier recovery, low rep counts on auth hosts, bail on first non-200.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
248
scripts/probe_rate_limit.py
Normal file
248
scripts/probe_rate_limit.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""Rate-limit ladder probe — find each host's actual ceiling.
|
||||
|
||||
依次以越来越短的间隔向目标端点发请求,监控状态码 / body size / 异常。
|
||||
任何一档出现 429 / 403 / 5xx / 异常 close → 停在该档,把上一档作为安全水位。
|
||||
|
||||
设计原则
|
||||
- 单点采样不下重复结论:每档至少 8-10 次请求才作判断
|
||||
- 每两档之间插 30s 恢复期,避免上一档触发的限流污染下一档
|
||||
- 只读端点(GET),不修改任何东西
|
||||
- Pro API 用候选清单里我们本来就要打的 UUID,不浪费指纹
|
||||
|
||||
Usage:
|
||||
uv run python scripts/probe_rate_limit.py --host oshwhub
|
||||
uv run python scripts/probe_rate_limit.py --host detail
|
||||
uv run python scripts/probe_rate_limit.py --host pro # cookie required
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from crawlers.oshwhub.crawler import ( # noqa: E402
|
||||
BROWSER_UA,
|
||||
PRO_API,
|
||||
PRO_COOKIE_PATH_DEFAULT,
|
||||
PRO_EDITOR_VERSION,
|
||||
UA,
|
||||
make_client,
|
||||
make_pro_source_client,
|
||||
)
|
||||
|
||||
|
||||
def ladder_oshwhub_listing(reps: int = 10) -> None:
    """Run the sleep ladder against the oshwhub.com/api/project listing API (no auth).

    Each tier sends ``reps`` identical listing requests at a progressively
    shorter interval; the ladder stops at the first tier that
    ``_run_one_tier`` reports as failed.

    Args:
        reps: Number of requests sent in each tier.
    """
    client = make_client()
    try:
        # Start polite, ramp aggressive; 0.0 means back-to-back requests.
        for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
            tier_ok = _run_one_tier(
                client,
                "GET",
                "https://oshwhub.com/api/project",
                params={"page": 1, "pageSize": 30, "origin": "pro"},
                sleep=sleep,
                reps=reps,
                tier_name=f"listing@{sleep}s",
            )
            if not tier_ok:
                break
    finally:
        # Release the connection pool even if a tier raises or is interrupted.
        client.close()
|
||||
|
||||
|
||||
def ladder_oshwhub_detail(reps: int = 10) -> None:
    """Run the sleep ladder against oshwhub.com/<owner>/<path> detail HTML pages.

    Paths come from the batch-50 candidate list so the probe exercises real
    targets. The first ``reps`` candidate paths are fetched once per tier:
    pages are distinct within a tier, but the same set is reused for every
    tier. The ladder stops at the first tier ``_run_paths_tier`` reports as
    failed.

    Args:
        reps: Maximum number of distinct pages fetched in each tier.
    """
    # JSONL: one JSON object per line. Skip blank lines (e.g. a trailing
    # newline) instead of crashing in json.loads, and close the file promptly.
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    # Loop-invariant, so build it once. max() keeps a negative ``reps`` from
    # turning the slice into "everything but the last few".
    paths = [c["path"] for c in candidates[: max(reps, 0)]]
    if not paths:
        # Nothing to probe; avoid walking the ladder with empty tiers, each of
        # which would still pay the 30s recovery sleep.
        print("no candidate paths available for detail probe", file=sys.stderr)
        return

    client = make_client()
    try:
        # Start polite, ramp aggressive.
        for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
            if not _run_paths_tier(
                client,
                paths,
                sleep=sleep,
                tier_name=f"detail@{sleep}s",
            ):
                break
    finally:
        # Release the connection pool even if a tier raises or is interrupted.
        client.close()
|
||||
|
||||
|
||||
def ladder_pro_api(reps: int = 8) -> None:
    """Run the sleep ladder against pro.lceda.cn/api/v4/projects/<P> (auth required).

    Probes the project-meta endpoint with the logged-in cookie client. Reps
    are kept low because this is the most precious host (account ban risk).
    The ladder is conservative and stops at the first tier that
    ``_run_pro_tier`` reports as failed.

    Args:
        reps: Number of distinct Pro project UUIDs requested in each tier.
            If fewer Pro candidates are available, nothing is probed.
    """
    if reps < 1:
        # An empty tier proves nothing and would only burn recovery sleeps.
        print(f"reps must be >= 1 (got {reps})", file=sys.stderr)
        return

    # JSONL: one JSON object per line. Skip blank lines instead of crashing in
    # json.loads, and close the file promptly.
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"]
    if len(pro_uuids) < reps:
        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
        return

    tier_uuids = pro_uuids[:reps]
    client = make_pro_source_client()
    try:
        # Conservative ladder for Pro: start at 5s, step down, stop early on trouble.
        for sleep in (5.0, 2.0, 1.0, 0.5, 0.25):
            if not _run_pro_tier(client, tier_uuids, sleep=sleep, tier=f"pro@{sleep}s"):
                print(f"\n STOP at {sleep}s — previous tier is safe water-mark.")
                break
    finally:
        # Release the connection pool even if a tier raises or is interrupted.
        client.close()
|
||||
|
||||
|
||||
def _run_one_tier(
    client: httpx.Client,
    method: str,
    url: str,
    *,
    sleep: float,
    reps: int,
    tier_name: str,
    params: dict | None = None,
) -> bool:
    """Send ``reps`` identical requests for one ladder tier and judge the tier.

    A response is bad unless it has status 200 and a non-empty body. Any
    exception is also bad and is recorded as status ``-1``. A summary is
    printed via ``_summary`` once all requests have been sent.

    Args:
        client: HTTP client used to issue the requests.
        method: HTTP method passed to ``client.request``.
        url: Target URL.
        sleep: Seconds to wait between consecutive requests.
        reps: Number of requests to send.
        tier_name: Label printed in the tier header.
        params: Optional query parameters.

    Returns:
        False if any request was bad. Otherwise True, after a 30s recovery
        pause when ``sleep`` is greater than zero.
    """
    print(f"\n=== {tier_name} ({reps} reqs at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for attempt in range(1, reps + 1):
        started = time.perf_counter()
        try:
            resp = client.request(method, url, params=params)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 0:
                failures += 1
                print(f" [{attempt}] !! status={resp.status_code} sz={nbytes}", flush=True)
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final request of the tier.
        if attempt < reps:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        print(f" -> tier FAILED ({failures}/{reps} bad). Stopping ladder.")
        return False

    if sleep > 0:
        # Let the server settle so this tier cannot pollute the next one.
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||||
|
||||
|
||||
def _run_paths_tier(
    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
) -> bool:
    """Fetch each oshwhub detail page in ``paths`` once for one ladder tier.

    A response is bad unless it has status 200 and a body larger than 5000
    bytes. Any exception is also bad and is recorded as status ``-1``. A
    summary is printed via ``_summary`` once all pages have been requested.

    Args:
        client: HTTP client used to issue the GET requests.
        paths: Page paths appended to ``https://oshwhub.com/``.
        sleep: Seconds to wait between consecutive requests.
        tier_name: Label printed in the tier header.

    Returns:
        False if any request was bad. Otherwise True, after a 30s recovery
        pause when ``sleep`` is greater than zero.
    """
    total = len(paths)
    print(f"\n=== {tier_name} ({total} pages at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for pos, page in enumerate(paths, start=1):
        target = f"https://oshwhub.com/{page}"
        started = time.perf_counter()
        try:
            resp = client.get(target)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            # Detail pages should be sizable; a tiny body is treated as a failure.
            if resp.status_code != 200 or nbytes <= 5000:
                failures += 1
                print(
                    f" [{pos}] !! status={resp.status_code} sz={nbytes} url={target[:80]}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f" [{pos}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final page of the tier.
        if pos < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        print(" -> tier FAILED. Stopping ladder.")
        return False

    if sleep > 0:
        # Let the server settle so this tier cannot pollute the next one.
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||||
|
||||
|
||||
def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Request project metadata for each UUID in one Pro ladder tier.

    A call is good only when the response has status 200 and its JSON body is
    an object with a truthy ``"success"`` field. Anything else, including an
    unparsable body or an exception (recorded as status ``-1``), is bad.

    The tier aborts on the first bad call rather than sending the remaining
    requests: this host is behind a logged-in account, so continuing to hit
    it after it has started refusing only adds risk. A summary of the calls
    actually made is printed via ``_summary``.

    Args:
        client: HTTP client used to issue the GET requests.
        uuids: Project UUIDs; each is requested once.
        sleep: Seconds to wait between consecutive requests.
        tier: Label printed in the tier header.

    Returns:
        False if a call was bad. Otherwise True, after a 30s recovery pause
        when ``sleep`` is greater than zero.
    """
    print(f"\n=== {tier} ({len(uuids)} project meta calls at {sleep}s) ===")
    statuses: list[int] = []
    sizes: list[int] = []
    latencies: list[float] = []
    bad = 0
    for i, u in enumerate(uuids):
        url = f"{PRO_API}/projects/{u}"
        t0 = time.perf_counter()
        try:
            r = client.get(url, headers={"path": u})
            sz = len(r.content)
            statuses.append(r.status_code)
            sizes.append(sz)
            latencies.append(time.perf_counter() - t0)
            try:
                j = r.json()
                # A non-object JSON body (list, string, ...) cannot carry "success".
                ok = r.status_code == 200 and isinstance(j, dict) and bool(j.get("success", False))
            except ValueError:
                # Body is not valid JSON (or not decodable as text).
                ok = False
            if not ok:
                bad += 1
                print(
                    f" [{i+1}] !! status={r.status_code} sz={sz} body[:200]={r.text[:200]!r}",
                    flush=True,
                )
        except Exception as e:  # noqa: BLE001
            bad += 1
            statuses.append(-1)
            print(f" [{i+1}] EXC {type(e).__name__}: {e}", flush=True)
        if bad:
            # Bail immediately: do not send the rest of the tier to an
            # authenticated host that has already pushed back.
            skipped = len(uuids) - (i + 1)
            if skipped:
                print(f" aborting tier early; {skipped} call(s) not sent", flush=True)
            break
        # No pause after the final call of the tier.
        if i + 1 < len(uuids):
            time.sleep(sleep)
    _summary(statuses, sizes, latencies, bad)
    if bad:
        return False
    if sleep > 0:
        # Let the server settle so this tier cannot pollute the next one.
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||||
|
||||
|
||||
def _summary(statuses, sizes, latencies, bad) -> None:
    """Print a short digest of one tier's results.

    Prints a count per status code, the number of bad requests and, when
    latencies were recorded, their median and 90th percentile in
    milliseconds. A second line with the median, min and max body size is
    printed when sizes were recorded. Prints nothing if ``statuses`` is empty.

    Args:
        statuses: Status code per request (callers record ``-1`` for exceptions).
        sizes: Body size in bytes for each request that returned a response.
        latencies: Elapsed seconds for each request that returned a response.
        bad: Number of requests the caller judged as failed.
    """
    if not statuses:
        return
    by_code: dict[int, int] = {}
    for s in statuses:
        by_code[s] = by_code.get(s, 0) + 1
    if latencies:
        ordered = sorted(latencies)
        med = statistics.median(ordered)
        # Nearest-rank percentile: rank = ceil(0.9 * n), as a 0-based index.
        # Integer arithmetic avoids float rounding; int(n * 0.9) would select
        # the maximum for n <= 10, i.e. report p100 as "p90".
        p90 = ordered[(9 * len(ordered) + 9) // 10 - 1]
        print(f" status: {by_code} bad={bad} latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms")
    else:
        print(f" status: {by_code} bad={bad}")
    if sizes:
        print(f" size: median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")
|
||||
|
||||
|
||||
def main() -> int:
    """Parse command-line arguments and run the ladder for the chosen host.

    ``--host`` selects the ladder (``oshwhub`` listing API, ``detail`` HTML
    pages, or ``pro`` API). ``--reps`` sets requests per tier and must be at
    least 1; for ``pro`` it is additionally capped at 8.

    Returns:
        Process exit code; 0 once the selected ladder has returned. Invalid
        arguments exit through argparse instead of returning.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--host", choices=["oshwhub", "detail", "pro"], required=True)
    ap.add_argument("--reps", type=int, default=10)
    args = ap.parse_args()

    # Zero reps probes nothing, and a negative value would be used as a slice
    # bound downstream and select almost every candidate per tier.
    if args.reps < 1:
        ap.error("--reps must be >= 1")

    if args.host == "oshwhub":
        ladder_oshwhub_listing(reps=args.reps)
    elif args.host == "detail":
        ladder_oshwhub_detail(reps=args.reps)
    elif args.host == "pro":
        ladder_pro_api(reps=min(args.reps, 8))  # cap pro reps
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user