Ladder probe of lceda.cn/api/documents/<uuid>: 5 tiers (5/2/1/0.5/0.25 s) × 9 distinct Std doc UUIDs = 45 requests total, all 200/success. Latency variance is dominated by payload size (Std docs span 4 KB to 4.5 MB), not by server backpressure. Same posture as the Pro API. Net effect on the batch-50 estimate: 25 Std items × 10 doc calls saves ~19 min of wall time (21 min of sleep -> 2 min of sleep). The combined plan now projects ~2 h -> ~10 min of wall time, exclusive of download bytes. scripts/probe_rate_limit.py: --host std-doc tier added. It reads doc UUIDs from /tmp/std_doc_uuids.json (assembled by the caller from any source/manifest.json upstream_version_documents lists), so it is reusable. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
311 lines
11 KiB
Python
311 lines
11 KiB
Python
"""Rate-limit ladder probe — find each host's actual ceiling.
|
||
|
||
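Send requests to the target endpoint at progressively shorter intervals,
monitoring status code / body size / exceptions.
If any tier sees a 429 / 403 / 5xx / abnormal connection close -> stop at
that tier and treat the previous tier as the safe water-mark.

Design principles
- No conclusions from a single sample: each tier makes at least 8-10
  requests before a judgement is made
- A 30 s recovery period is inserted between tiers so that throttling
  triggered by one tier does not contaminate the next
- Read-only endpoints (GET) only; nothing is modified
- The Pro API probe uses UUIDs from the candidate list that we were going
  to hit anyway, so no fingerprint is wasted

Usage:
    uv run python scripts/probe_rate_limit.py --host oshwhub
    uv run python scripts/probe_rate_limit.py --host detail
    uv run python scripts/probe_rate_limit.py --host pro      # cookie required
    uv run python scripts/probe_rate_limit.py --host std-doc  # reads /tmp/std_doc_uuids.json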
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import statistics
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
import httpx
|
||
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||
from crawlers.oshwhub.crawler import ( # noqa: E402
|
||
BROWSER_UA,
|
||
LCEDA_DOC_API,
|
||
PRO_API,
|
||
PRO_COOKIE_PATH_DEFAULT,
|
||
PRO_EDITOR_VERSION,
|
||
UA,
|
||
make_client,
|
||
make_pro_source_client,
|
||
make_source_client,
|
||
)
|
||
|
||
|
||
def ladder_oshwhub_listing(reps: int = 10) -> None:
    """Run the rate-limit ladder against the oshwhub.com/api/project listing API.

    The listing endpoint needs no auth. Request intervals shrink from 2 s down
    to 0 s, and the ladder ends at the first tier that `_run_one_tier` reports
    as failed.

    Args:
        reps: Number of requests issued per tier.
    """
    http = make_client()
    listing_url = "https://oshwhub.com/api/project"
    query = {"page": 1, "pageSize": 30, "origin": "pro"}
    for interval in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
        tier_passed = _run_one_tier(
            http,
            "GET",
            listing_url,
            params=query,
            sleep=interval,
            reps=reps,
            tier_name=f"listing@{interval}s",
        )
        if not tier_passed:
            return
|
||
|
||
|
||
def ladder_oshwhub_detail(reps: int = 10) -> None:
    """Run the rate-limit ladder against oshwhub.com/<owner>/<path> detail pages.

    Paths are taken from the batch-50 candidate list so the probe exercises
    real targets. Each tier fetches the first `reps` candidate paths; the
    ladder ends at the first tier that `_run_paths_tier` reports as failed.

    Args:
        reps: Maximum number of candidate paths fetched per tier.
    """
    # Use a context manager so the file handle is closed deterministically,
    # pin UTF-8 so decoding does not depend on the locale, and skip blank
    # lines (json.loads rejects them).
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    # The selection is identical for every tier, so compute it once. Within a
    # tier each path is requested once (assuming the candidate list itself
    # holds no duplicates).
    paths = [c["path"] for c in candidates[:reps]]
    if not paths:
        # An empty tier would "pass" without sending anything and still incur
        # the recovery sleep, so bail out instead.
        print("no candidate paths available", file=sys.stderr)
        return

    client = make_client()
    # Start polite, ramp aggressive
    sched = [2.0, 1.0, 0.5, 0.25, 0.1, 0.0]
    for sleep in sched:
        if not _run_paths_tier(
            client,
            paths,
            sleep=sleep,
            tier_name=f"detail@{sleep}s",
        ):
            break
|
||
|
||
|
||
def ladder_std_doc(reps: int = 10) -> None:
    """Run the ladder against lceda.cn/api/documents/<doc_uuid> (anonymous Std doc fetch).

    Reads the UUID pool from /tmp/std_doc_uuids.json (collected from
    already-crawled Std projects). Std endpoints want a browser UA + Referer
    (see docs/sources/easyeda_std_source.md §3), so the real source client is
    used.

    Args:
        reps: Number of doc fetches per tier. The pool must hold at least this
            many UUIDs, otherwise the probe refuses to run.
    """
    # JSON is UTF-8; do not depend on the locale's default encoding.
    uuids = json.loads(Path("/tmp/std_doc_uuids.json").read_text(encoding="utf-8"))
    if len(uuids) < reps:
        print(f"only {len(uuids)} Std doc UUIDs available (need {reps})", file=sys.stderr)
        return

    client = make_source_client()
    sched = [5.0, 2.0, 1.0, 0.5, 0.25]
    cursor = 0
    for sleep in sched:
        # Rotate through the pool so each tier hits distinct doc UUIDs.
        slot = uuids[cursor : cursor + reps]
        cursor += reps
        if len(slot) < reps:
            slot = uuids[:reps]  # fall back to repeats if pool exhausted
        if not _run_std_tier(client, slot, sleep=sleep, tier=f"std-doc@{sleep}s"):
            # Announce the stop the same way the Pro ladder does; otherwise
            # the ladder just ends with no explanation.
            print(f"\n STOP at {sleep}s — previous tier is safe water-mark.")
            break
|
||
|
||
|
||
def ladder_pro_api(reps: int = 8) -> None:
    """Run a conservative ladder against pro.lceda.cn/api/v4/projects/<P> (auth required).

    Probes the project-meta endpoint with the logged-in cookie. Reps are kept
    low because this is the most precious host (account ban risk), and the
    ladder bails on the first tier that `_run_pro_tier` reports as failed.

    Args:
        reps: Number of project-meta calls per tier. At least this many
            candidates with ``origin == "pro"`` must exist, otherwise the
            probe refuses to run.
    """
    # Use a context manager so the file handle is closed deterministically,
    # pin UTF-8 so decoding does not depend on the locale, and skip blank
    # lines (json.loads rejects them).
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"]
    if len(pro_uuids) < reps:
        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
        return

    client = make_pro_source_client()
    # Conservative ladder for Pro: start 5s, halve down, stop early on trouble
    sched = [5.0, 2.0, 1.0, 0.5, 0.25]
    for sleep in sched:
        if not _run_pro_tier(client, pro_uuids[:reps], sleep=sleep, tier=f"pro@{sleep}s"):
            print(f"\n STOP at {sleep}s — previous tier is safe water-mark.")
            break
|
||
|
||
|
||
def _run_one_tier(
    client: httpx.Client,
    method: str,
    url: str,
    *,
    sleep: float,
    reps: int,
    tier_name: str,
    params: dict | None = None,
) -> bool:
    """Send `reps` identical requests at a fixed interval and judge the tier.

    A request is bad when it raises, answers with a non-200 status, or has an
    empty body. The tier always runs to completion before it is judged.

    Args:
        client: HTTP client used to send the requests.
        method: HTTP method passed to ``client.request``.
        url: Target URL.
        sleep: Seconds to wait between consecutive requests.
        reps: Number of requests in the tier.
        tier_name: Label printed in the tier header.
        params: Optional query parameters.

    Returns:
        False if any request was bad. Otherwise True, after a 30 s recovery
        pause (skipped when `sleep` is 0).
    """
    print(f"\n=== {tier_name} ({reps} reqs at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for attempt in range(1, reps + 1):
        started = time.perf_counter()
        try:
            resp = client.request(method, url, params=params)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 0:
                failures += 1
                print(f" [{attempt}] !! status={resp.status_code} sz={nbytes}", flush=True)
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)  # -1 marks a request that raised
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No trailing wait after the last request of the tier.
        if attempt < reps:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        print(f" -> tier FAILED ({failures}/{reps} bad). Stopping ladder.")
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||
|
||
|
||
def _run_paths_tier(
    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
) -> bool:
    """Fetch each oshwhub.com detail page once at a fixed interval and judge the tier.

    A fetch is bad when it raises, answers with a non-200 status, or returns a
    body of 5000 bytes or fewer (detail pages should be sizable).

    Args:
        client: HTTP client used to send the requests.
        paths: Page paths appended to ``https://oshwhub.com/``.
        sleep: Seconds to wait between consecutive fetches.
        tier_name: Label printed in the tier header.

    Returns:
        False if any fetch was bad. Otherwise True, after a 30 s recovery
        pause (skipped when `sleep` is 0).
    """
    total = len(paths)
    print(f"\n=== {tier_name} ({total} pages at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for idx, page in enumerate(paths, start=1):
        url = f"https://oshwhub.com/{page}"
        started = time.perf_counter()
        try:
            resp = client.get(url)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 5000:
                failures += 1
                print(
                    f" [{idx}] !! status={resp.status_code} sz={nbytes} url={url[:80]}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)  # -1 marks a request that raised
            print(f" [{idx}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No trailing wait after the last page of the tier.
        if idx < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        print(" -> tier FAILED. Stopping ladder.")
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||
|
||
|
||
def _run_std_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Fetch each Std document once at a fixed interval and judge the tier.

    A fetch is good only when the status is 200 and the JSON body carries a
    truthy ``success`` field; anything else (including an unparsable body or
    a raised exception) is bad.

    Args:
        client: HTTP client used to send the requests.
        uuids: Document UUIDs appended to ``LCEDA_DOC_API``.
        sleep: Seconds to wait between consecutive fetches.
        tier: Label printed in the tier header.

    Returns:
        False if any fetch was bad. Otherwise True, after a 30 s recovery
        pause (skipped when `sleep` is 0).
    """
    total = len(uuids)
    print(f"\n=== {tier} ({total} doc fetches at {sleep}s) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for idx, doc_uuid in enumerate(uuids, start=1):
        url = f"{LCEDA_DOC_API}/{doc_uuid}"
        started = time.perf_counter()
        try:
            resp = client.get(url, params={"uuid": doc_uuid, "path": doc_uuid})
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            try:
                # The body is only parsed when the status is already 200.
                passed = resp.status_code == 200 and resp.json().get("success", False)
            except Exception:
                passed = False
            if not passed:
                failures += 1
                print(
                    f" [{idx}] !! status={resp.status_code} sz={nbytes} body[:200]={resp.text[:200]!r}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)  # -1 marks a request that raised
            print(f" [{idx}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No trailing wait after the last fetch of the tier.
        if idx < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||
|
||
|
||
def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Call the Pro project-meta endpoint once per UUID and judge the tier.

    A call is good only when the status is 200 and the JSON body carries a
    truthy ``success`` field; anything else (including an unparsable body or
    a raised exception) is bad.

    Args:
        client: HTTP client used to send the requests.
        uuids: Project UUIDs appended to ``{PRO_API}/projects/``; each is also
            sent as the ``path`` request header.
        sleep: Seconds to wait between consecutive calls.
        tier: Label printed in the tier header.

    Returns:
        False if any call was bad. Otherwise True, after a 30 s recovery
        pause (skipped when `sleep` is 0).
    """
    total = len(uuids)
    print(f"\n=== {tier} ({total} project meta calls at {sleep}s) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for idx, project_uuid in enumerate(uuids, start=1):
        url = f"{PRO_API}/projects/{project_uuid}"
        started = time.perf_counter()
        try:
            resp = client.get(url, headers={"path": project_uuid})
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            try:
                payload = resp.json()
                passed = resp.status_code == 200 and payload.get("success", False)
            except Exception:
                passed = False
            if not passed:
                failures += 1
                print(
                    f" [{idx}] !! status={resp.status_code} sz={nbytes} body[:200]={resp.text[:200]!r}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)  # -1 marks a request that raised
            print(f" [{idx}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No trailing wait after the last call of the tier.
        if idx < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)

    if failures:
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
|
||
|
||
|
||
def _summary(statuses, sizes, latencies, bad) -> None:
    """Print a digest of one tier: status histogram, latency stats, body sizes.

    Nothing is printed when `statuses` is empty.

    Args:
        statuses: Status code per request (callers record -1 for a request
            that raised).
        sizes: Body size per request that produced a response.
        latencies: Elapsed seconds per request that produced a response.
        bad: Number of requests the caller counted as bad.
    """
    if not statuses:
        return

    # Plain dict (not Counter) so the printed repr stays `{code: count}`.
    by_code: dict[int, int] = {}
    for s in statuses:
        by_code[s] = by_code.get(s, 0) + 1

    if latencies:
        med = statistics.median(latencies)
        ordered = sorted(latencies)
        # Nearest-rank percentile: index ceil(0.9 * n) - 1, computed in
        # integer arithmetic. The previous int(n * 0.9) was one too high
        # whenever 0.9 * n is an integer, so with 10 samples "p90" was
        # always the maximum.
        p90 = ordered[(9 * len(ordered) + 9) // 10 - 1]
        print(f" status: {by_code} bad={bad} latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms")
    else:
        print(f" status: {by_code} bad={bad}")

    if sizes:
        print(f" size: median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")
|
||
|
||
|
||
def main() -> int:
    """Parse the command line and run the ladder for the selected host.

    Options:
        --host: One of ``oshwhub``, ``detail``, ``pro``, ``std-doc`` (required).
        --reps: Requests per tier (default 10, must be >= 1). For ``pro`` the
            value is capped at 8.

    Returns:
        0 after the selected ladder returns. Invalid arguments terminate the
        process through argparse (exit status 2).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--host", choices=["oshwhub", "detail", "pro", "std-doc"], required=True)
    ap.add_argument("--reps", type=int, default=10)
    args = ap.parse_args()

    # A tier with no requests records no failures, so it would "pass" and
    # still pay the 30 s recovery sleep; reject such values up front.
    if args.reps < 1:
        ap.error("--reps must be a positive integer")

    if args.host == "oshwhub":
        ladder_oshwhub_listing(reps=args.reps)
    elif args.host == "detail":
        ladder_oshwhub_detail(reps=args.reps)
    elif args.host == "pro":
        ladder_pro_api(reps=min(args.reps, 8))  # cap pro reps
    elif args.host == "std-doc":
        ladder_std_doc(reps=args.reps)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|