crawler: drop SLEEP_SOURCE 5.0 -> 0.5 (Std doc endpoint probe)

Ladder probe lceda.cn/api/documents/<uuid>: 5 tiers (5/2/1/0.5/0.25s)
× 9 distinct Std doc UUIDs = 45 reqs total, all 200/success. Latency
variance is dominated by payload size (Std docs span 4 KB to 4.5 MB),
not by server backpressure. Same posture as Pro API.

Net effect on batch-50 estimate: 25 Std items × 10 doc calls saves ~19
min of wall time (21 min sleep -> 2 min sleep). Combined plan now projects
~2 h -> ~10 min wall time, exclusive of download bytes.

scripts/probe_rate_limit.py: --host std-doc tier added. Reads doc UUIDs
from /tmp/std_doc_uuids.json (assembled by caller from any source/manifest.json
upstream_version_documents lists). Reusable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 00:54:46 +08:00
parent 8b857428e3
commit 183f82a3be
3 changed files with 86 additions and 9 deletions

View File

@@ -29,12 +29,14 @@ import httpx
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import ( # noqa: E402
BROWSER_UA,
LCEDA_DOC_API,
PRO_API,
PRO_COOKIE_PATH_DEFAULT,
PRO_EDITOR_VERSION,
UA,
make_client,
make_pro_source_client,
make_source_client,
)
@@ -79,6 +81,30 @@ def ladder_oshwhub_detail(reps: int = 10) -> None:
break
def ladder_std_doc(reps: int = 10) -> None:
    """Probe lceda.cn/api/documents/<doc_uuid> — anonymous Std doc fetch.

    Reads /tmp/std_doc_uuids.json (collected from already-crawled Std
    projects). Std endpoints want browser UA + Referer (see
    docs/sources/easyeda_std_source.md §3) — use the real source client.

    Walks a descending sleep schedule (5.0s down to 0.25s), issuing ``reps``
    doc fetches per tier, and stops at the first tier that reports a failure.

    Args:
        reps: Number of doc fetches per tier. When the UUID file holds fewer
            entries than this, a message is printed to stderr and nothing
            is probed.
    """
    uuids = json.loads(Path("/tmp/std_doc_uuids.json").read_text())
    if len(uuids) < reps:
        print(f"only {len(uuids)} Std doc UUIDs available", file=sys.stderr)
        return
    client = make_source_client()
    try:
        sched = [5.0, 2.0, 1.0, 0.5, 0.25]
        cursor = 0
        for sleep in sched:
            # rotate so each tier hits distinct doc UUIDs
            slot = uuids[cursor : cursor + reps]
            cursor += reps
            if len(slot) < reps:
                slot = uuids[:reps]  # fall back to repeats if pool exhausted
            if not _run_std_tier(client, slot, sleep=sleep, tier=f"std-doc@{sleep}s"):
                break
    finally:
        # The client is created in this function, so release its pooled
        # connections here on every exit path (normal end, early break,
        # or an exception escaping a tier).
        client.close()
def ladder_pro_api(reps: int = 8) -> None:
"""pro.lceda.cn/api/v4/projects/<P> — auth required.
@@ -178,6 +204,40 @@ def _run_paths_tier(
return True
def _run_std_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Run one tier of Std doc fetches and report whether it was clean.

    Each UUID is fetched once from ``LCEDA_DOC_API``, pausing ``sleep``
    seconds between consecutive requests (no pause after the last one).
    A fetch counts as bad when it raises, returns a non-200 status, or its
    JSON body lacks a truthy ``success`` field.

    Returns:
        False if any fetch in the tier was bad; True otherwise. A clean tier
        with a positive ``sleep`` also waits 30 seconds before returning.
    """
    print(f"\n=== {tier} ({len(uuids)} doc fetches at {sleep}s) ===")
    statuses = []
    sizes = []
    lats = []
    bad = 0
    total = len(uuids)
    for pos, doc_uuid in enumerate(uuids, start=1):
        url = f"{LCEDA_DOC_API}/{doc_uuid}"
        started = time.perf_counter()
        try:
            resp = client.get(url, params={"uuid": doc_uuid, "path": doc_uuid})
            body_len = len(resp.content)
            statuses.append(resp.status_code)
            sizes.append(body_len)
            lats.append(time.perf_counter() - started)
            # Only a 200 response is asked for its JSON; any parsing or
            # lookup problem is treated as a failed fetch.
            healthy = False
            if resp.status_code == 200:
                try:
                    healthy = resp.json().get("success", False)
                except Exception:
                    healthy = False
            if not healthy:
                bad += 1
                print(
                    f" [{pos}] !! status={resp.status_code} sz={body_len} body[:200]={resp.text[:200]!r}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            bad += 1
            statuses.append(-1)  # -1 stands in for "no HTTP status obtained"
            print(f" [{pos}] EXC {type(exc).__name__}: {exc}", flush=True)
        if pos < total:
            time.sleep(sleep)
    _summary(statuses, sizes, lats, bad)
    if bad:
        return False
    if sleep > 0:
        print(" recovery sleep 30s before next tier...")
        time.sleep(30)
    return True
def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
print(f"\n=== {tier} ({len(uuids)} project meta calls at {sleep}s) ===")
statuses, sizes, latencies = [], [], []
@@ -231,7 +291,7 @@ def _summary(statuses, sizes, latencies, bad) -> None:
def main() -> int:
ap = argparse.ArgumentParser()
ap.add_argument("--host", choices=["oshwhub", "detail", "pro"], required=True)
ap.add_argument("--host", choices=["oshwhub", "detail", "pro", "std-doc"], required=True)
ap.add_argument("--reps", type=int, default=10)
args = ap.parse_args()
@@ -241,6 +301,8 @@ def main() -> int:
ladder_oshwhub_detail(reps=args.reps)
elif args.host == "pro":
ladder_pro_api(reps=min(args.reps, 8)) # cap pro reps
elif args.host == "std-doc":
ladder_std_doc(reps=args.reps)
return 0