Files
FacereDataset/scripts/probe_rate_limit.py
Knowit 183f82a3be crawler: drop SLEEP_SOURCE 5.0 -> 0.5 (Std doc endpoint probe)
Ladder probe lceda.cn/api/documents/<uuid>: 5 tiers (5/2/1/0.5/0.25s)
× 9 distinct Std doc UUIDs = 45 reqs total, all 200/success. Latency
variance is dominated by payload size (Std docs span 4 KB to 4.5 MB)
not server backpressure. Same posture as Pro API.

Net effect on batch-50 estimate: Std 25 项 × 10 doc calls saved ~19
min wall time (21min sleep -> 2min sleep). Combined plan now projects
~2h -> ~10min walltime exclusive of download bytes.

scripts/probe_rate_limit.py: --host std-doc tier added. Reads doc UUIDs
from /tmp/std_doc_uuids.json (assembled by caller from any source/manifest.json
upstream_version_documents lists). Reusable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 00:54:46 +08:00

311 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Rate-limit ladder probe — find each host's actual ceiling.
依次以越来越短的间隔向目标端点发请求,监控状态码 / body size / 异常。
任何一档出现 429 / 403 / 5xx / 异常 close → 停在该档,把上一档作为安全水位。
设计原则
- 单点采样不下重复结论:每档至少 8-10 次请求才作判断
- 每两档之间插 30s 恢复期,避免上一档触发的限流污染下一档
- 只读端点（GET），不修改任何东西
- Pro API 用候选清单里我们本来就要打的 UUID，不浪费指纹
Usage:
uv run python scripts/probe_rate_limit.py --host oshwhub
uv run python scripts/probe_rate_limit.py --host detail
uv run python scripts/probe_rate_limit.py --host pro # cookie required
uv run python scripts/probe_rate_limit.py --host std-doc # reads /tmp/std_doc_uuids.json
"""
from __future__ import annotations
import argparse
import json
import statistics
import sys
import time
from pathlib import Path
import httpx
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import ( # noqa: E402
BROWSER_UA,
LCEDA_DOC_API,
PRO_API,
PRO_COOKIE_PATH_DEFAULT,
PRO_EDITOR_VERSION,
UA,
make_client,
make_pro_source_client,
make_source_client,
)
def ladder_oshwhub_listing(reps: int = 10) -> None:
    """Walk the interval ladder against the oshwhub.com listing API (no auth).

    Each tier sends ``reps`` GET requests to ``oshwhub.com/api/project`` with
    a fixed query, starting at a 2 s interval and shrinking down to no delay
    at all. The walk ends at the first tier ``_run_one_tier`` reports as
    failed.
    """
    listing_url = "https://oshwhub.com/api/project"
    query = {"page": 1, "pageSize": 30, "origin": "pro"}
    http = make_client()
    for interval in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
        tier_ok = _run_one_tier(
            http,
            "GET",
            listing_url,
            params=query,
            sleep=interval,
            reps=reps,
            tier_name=f"listing@{interval}s",
        )
        if not tier_ok:
            return
def ladder_oshwhub_detail(reps: int = 10) -> None:
    """Walk the interval ladder against oshwhub.com detail HTML pages.

    The first ``reps`` entries of
    ``data/state/oshwhub_batch50_candidates.jsonl`` (one JSON object per line,
    each read for its ``path`` key) supply the pages, so the probe exercises
    real crawl targets. Every tier fetches the same set of paths, each once.
    The walk ends at the first tier ``_run_paths_tier`` reports as failed.

    Args:
        reps: Number of candidate pages fetched per tier.
    """
    # Read through a context manager so the handle is closed deterministically,
    # decode as UTF-8 regardless of the platform locale, and tolerate blank
    # lines (e.g. a stray trailing newline) in the JSONL file.
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(line) for line in fh if line.strip()]
    # The path list is identical for every tier, so build it once.
    paths = [c["path"] for c in candidates[:reps]]
    client = make_client()
    # Start polite, ramp aggressive.
    for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
        if not _run_paths_tier(
            client,
            paths,
            sleep=sleep,
            tier_name=f"detail@{sleep}s",
        ):
            break
def ladder_std_doc(reps: int = 10) -> None:
    """Walk the interval ladder against lceda.cn/api/documents/<doc_uuid>.

    Doc UUIDs are loaded from ``/tmp/std_doc_uuids.json``. If fewer than
    ``reps`` are available the probe prints a notice to stderr and returns
    without sending anything. Requests go through ``make_source_client()``;
    per docs/sources/easyeda_std_source.md §3 the Std endpoints want a browser
    UA + Referer, which that client is expected to supply.

    Each tier takes the next ``reps``-sized window of the UUID pool so tiers
    hit distinct documents; once the pool runs short, the first ``reps`` UUIDs
    are reused.
    """
    pool = json.loads(Path("/tmp/std_doc_uuids.json").read_text())
    if len(pool) < reps:
        print(f"only {len(pool)} Std doc UUIDs available", file=sys.stderr)
        return
    http = make_source_client()
    intervals = (5.0, 2.0, 1.0, 0.5, 0.25)
    for tier_index, interval in enumerate(intervals):
        start = tier_index * reps
        batch = pool[start : start + reps]
        if len(batch) < reps:
            # Pool exhausted: fall back to repeating the first window.
            batch = pool[:reps]
        tier_ok = _run_std_tier(http, batch, sleep=interval, tier=f"std-doc@{interval}s")
        if not tier_ok:
            break
def ladder_pro_api(reps: int = 8) -> None:
    """Walk a conservative interval ladder against the Pro project-meta API.

    Probes ``pro.lceda.cn/api/v4/projects/<P>`` through the client returned by
    ``make_pro_source_client()`` (auth required; presumably that client
    carries the logged-in cookie). Reps are kept low because this is the most
    precious host (account ban risk), and the ladder bails on the first tier
    that reports any bad response.

    UUIDs come from ``data/state/oshwhub_batch50_candidates.jsonl``: entries
    whose ``origin`` is ``"pro"``. The same first ``reps`` UUIDs are used in
    every tier. If fewer than ``reps`` are available a notice goes to stderr
    and nothing is sent.

    Args:
        reps: Number of project-meta calls per tier.
    """
    # Read through a context manager so the handle is closed deterministically,
    # decode as UTF-8 regardless of the platform locale, and tolerate blank
    # lines (e.g. a stray trailing newline) in the JSONL file.
    with open("data/state/oshwhub_batch50_candidates.jsonl", encoding="utf-8") as fh:
        candidates = [json.loads(line) for line in fh if line.strip()]
    pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"]
    if len(pro_uuids) < reps:
        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
        return
    targets = pro_uuids[:reps]
    client = make_pro_source_client()
    # Conservative ladder for Pro: start 5s, halve down, stop early on trouble.
    for sleep in (5.0, 2.0, 1.0, 0.5, 0.25):
        if not _run_pro_tier(client, targets, sleep=sleep, tier=f"pro@{sleep}s"):
            print(f"\n STOP at {sleep}s — previous tier is safe water-mark.")
            break
def _run_one_tier(
    client: httpx.Client,
    method: str,
    url: str,
    *,
    sleep: float,
    reps: int,
    tier_name: str,
    params: dict | None = None,
) -> bool:
    """Fire ``reps`` identical requests at ``url`` spaced ``sleep`` seconds apart.

    A response is good when its status is 200 and the body is non-empty; any
    other response, or any exception, counts as bad. A summary is printed at
    the end of the tier.

    Returns:
        ``False`` if at least one request was bad. Otherwise ``True``, after a
        30 s recovery pause when ``sleep`` is positive.
    """
    print(f"\n=== {tier_name} ({reps} reqs at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0
    for attempt in range(1, reps + 1):
        started = time.perf_counter()
        try:
            resp = client.request(method, url, params=params)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 0:
                failures += 1
                print(f" [{attempt}] !! status={resp.status_code} sz={nbytes}", flush=True)
        except Exception as exc:  # noqa: BLE001
            # Any transport-level failure is recorded as pseudo-status -1.
            failures += 1
            codes.append(-1)
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final request of the tier.
        if attempt < reps:
            time.sleep(sleep)
    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        print(f" -> tier FAILED ({failures}/{reps} bad). Stopping ladder.")
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _run_paths_tier(
    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
) -> bool:
    """Fetch each path under https://oshwhub.com/ once, ``sleep`` seconds apart.

    A page is good when it returns status 200 with a body larger than 5000
    bytes (detail pages should be sizable); anything else, or any exception,
    counts as bad. A summary is printed at the end of the tier.

    Returns:
        ``False`` if at least one fetch was bad. Otherwise ``True``, after a
        30 s recovery pause when ``sleep`` is positive.
    """
    total = len(paths)
    print(f"\n=== {tier_name} ({total} pages at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0
    for attempt, path in enumerate(paths, start=1):
        page_url = f"https://oshwhub.com/{path}"
        started = time.perf_counter()
        try:
            resp = client.get(page_url)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 5000:
                failures += 1
                print(
                    f" [{attempt}] !! status={resp.status_code} sz={nbytes} url={page_url[:80]}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            # Any transport-level failure is recorded as pseudo-status -1.
            failures += 1
            codes.append(-1)
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final fetch of the tier.
        if attempt < total:
            time.sleep(sleep)
    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        print(" -> tier FAILED. Stopping ladder.")
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _run_std_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Fetch one Std document per UUID from ``LCEDA_DOC_API``, ``sleep`` s apart.

    A fetch is good when the status is 200 and the JSON body has a truthy
    ``success`` field; a body that cannot be parsed that way counts as bad, as
    does any exception. A summary is printed at the end of the tier.

    Returns:
        ``False`` if at least one fetch was bad. Otherwise ``True``, after a
        30 s recovery pause when ``sleep`` is positive.
    """
    total = len(uuids)
    print(f"\n=== {tier} ({total} doc fetches at {sleep}s) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0
    for attempt, doc_uuid in enumerate(uuids, start=1):
        doc_url = f"{LCEDA_DOC_API}/{doc_uuid}"
        started = time.perf_counter()
        try:
            resp = client.get(doc_url, params={"uuid": doc_uuid, "path": doc_uuid})
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            succeeded = False
            if resp.status_code == 200:
                # The body is only parsed for 200 responses.
                try:
                    succeeded = resp.json().get("success", False)
                except Exception:
                    succeeded = False
            if not succeeded:
                failures += 1
                print(
                    f" [{attempt}] !! status={resp.status_code} sz={nbytes} body[:200]={resp.text[:200]!r}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            # Any transport-level failure is recorded as pseudo-status -1.
            failures += 1
            codes.append(-1)
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final fetch of the tier.
        if attempt < total:
            time.sleep(sleep)
    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Call ``{PRO_API}/projects/<uuid>`` once per UUID, ``sleep`` seconds apart.

    Each request sends the UUID in a ``path`` header. A call is good when the
    status is 200 and the JSON body has a truthy ``success`` field; a body
    that cannot be parsed that way counts as bad, as does any exception. A
    summary is printed at the end of the tier.

    Returns:
        ``False`` if at least one call was bad. Otherwise ``True``, after a
        30 s recovery pause when ``sleep`` is positive.
    """
    total = len(uuids)
    print(f"\n=== {tier} ({total} project meta calls at {sleep}s) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0
    for attempt, project_uuid in enumerate(uuids, start=1):
        meta_url = f"{PRO_API}/projects/{project_uuid}"
        started = time.perf_counter()
        try:
            resp = client.get(meta_url, headers={"path": project_uuid})
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            try:
                payload = resp.json()
                succeeded = resp.status_code == 200 and payload.get("success", False)
            except Exception:
                succeeded = False
            if not succeeded:
                failures += 1
                print(
                    f" [{attempt}] !! status={resp.status_code} sz={nbytes} body[:200]={resp.text[:200]!r}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            # Any transport-level failure is recorded as pseudo-status -1.
            failures += 1
            codes.append(-1)
            print(f" [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final call of the tier.
        if attempt < total:
            time.sleep(sleep)
    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _summary(statuses, sizes, latencies, bad) -> None:
    """Print a per-tier digest of status codes, latency and body sizes.

    Prints nothing when ``statuses`` is empty. The first line shows a count
    per status code (in first-seen order) and the bad count, plus median and
    p90 latency in milliseconds when latencies were recorded. A second line
    with median/min/max body size is printed when sizes were recorded.
    """
    if not statuses:
        return
    # dict.fromkeys keeps first-seen order, matching an incremental tally.
    tally = {code: statuses.count(code) for code in dict.fromkeys(statuses)}
    headline = f" status: {tally} bad={bad}"
    if latencies:
        ordered = sorted(latencies)
        median_s = statistics.median(latencies)
        # p90 is the element at index floor(0.9 * n) of the sorted samples.
        p90_s = ordered[int(len(ordered) * 0.9)]
        headline += f" latency med={median_s * 1000:.0f}ms p90={p90_s * 1000:.0f}ms"
    print(headline)
    if sizes:
        print(f" size: median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")
def main() -> int:
    """Parse ``--host`` / ``--reps`` and run the matching ladder probe.

    ``--host`` is required and selects one of the four ladders; ``--reps``
    defaults to 10. For the ``pro`` host the rep count is capped at 8.

    Returns:
        Always 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", choices=["oshwhub", "detail", "pro", "std-doc"], required=True)
    parser.add_argument("--reps", type=int, default=10)
    opts = parser.parse_args()

    ladders = {
        "oshwhub": ladder_oshwhub_listing,
        "detail": ladder_oshwhub_detail,
        "pro": ladder_pro_api,
        "std-doc": ladder_std_doc,
    }
    # cap pro reps
    rep_count = min(opts.reps, 8) if opts.host == "pro" else opts.reps
    ladders[opts.host](reps=rep_count)
    return 0
# Script entry point: run main() and exit with the status code it returns.
if __name__ == "__main__":
    raise SystemExit(main())