From 183f82a3beb6f1a9fb105ff099d26cee66bbe211 Mon Sep 17 00:00:00 2001 From: Knowit Date: Wed, 29 Apr 2026 00:54:46 +0800 Subject: [PATCH] crawler: drop SLEEP_SOURCE 5.0 -> 0.5 (Std doc endpoint probe) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ladder probe lceda.cn/api/documents/: 5 tiers (5/2/1/0.5/0.25s) × 9 distinct Std doc UUIDs = 45 reqs total, all 200/success. Latency variance is dominated by payload size (Std docs span 4 KB to 4.5 MB) not server backpressure. Same posture as Pro API. Net effect on batch-50 estimate: Std 25 项 × 10 doc calls saved ~19 min wall time (21min sleep -> 2min sleep). Combined plan now projects ~2h -> ~10min walltime exclusive of download bytes. scripts/probe_rate_limit.py: --host std-doc tier added. Reads doc UUIDs from /tmp/std_doc_uuids.json (assembled by caller from any source/manifest.json upstream_version_documents lists). Reusable. Co-Authored-By: Claude Opus 4.7 (1M context) --- crawlers/oshwhub/crawler.py | 6 ++- docs/sources/probe_rate_limit_results.md | 25 ++++++--- scripts/probe_rate_limit.py | 64 +++++++++++++++++++++++- 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/crawlers/oshwhub/crawler.py b/crawlers/oshwhub/crawler.py index 04278b0..f53143c 100644 --- a/crawlers/oshwhub/crawler.py +++ b/crawlers/oshwhub/crawler.py @@ -49,8 +49,10 @@ BROWSER_UA = ( SLEEP_BETWEEN = 1.0 # oshwhub.com detail/listing — ladder probe: 0.5s clean, # 1.0s leaves headroom (detail HTML p90 hits 6s at 1.0s, # 15s at 0.5s due to server-queue softlimit). -SLEEP_SOURCE = 5.0 # lceda.cn Std source endpoints — NOT yet probed; keep - # conservative. Drop only after a dedicated ladder run. +SLEEP_SOURCE = 0.5 # lceda.cn Std source endpoints — ladder probe 5/2/1/0.5/0.25s + # all clean (45/45 200/success). Latency is dominated by + # payload size (Std docs span 4 KB to 4.5 MB) not server + # backpressure. Same posture as Pro API. 10x speedup. SLEEP_PRO = 0.5 # pro.lceda.cn API host — sustained burst probe (25 # distinct UUIDs at 0.5s) showed 0/25 errors, median # latency 410ms. 10x faster than the original 5.0s. diff --git a/docs/sources/probe_rate_limit_results.md b/docs/sources/probe_rate_limit_results.md index d2c8d7f..efec6b7 100644 --- a/docs/sources/probe_rate_limit_results.md +++ b/docs/sources/probe_rate_limit_results.md @@ -64,11 +64,23 @@ Then **sustained burst test** at the chosen water mark: cleanly, even sustained. Originally set high (5s) out of caution because Pro requires a logged-in account — that caution was unjustified. -## lceda.cn Std source endpoints — NOT YET PROBED +## lceda.cn Std doc endpoint (`/api/documents/`) -Currently `SLEEP_SOURCE = 5.0`. Should be probed before lowering. Std -crawler isn't on the critical path for batch-50 (~12 min vs Pro's -~10 min savings), so this can wait. +No auth (Std is anonymous-readable, browser UA + Referer only). +5 tiers × 9 distinct doc UUIDs from already-crawled Std projects. + +| sleep | status | bad | latency med | latency p90 | body median | +|---|---|---:|---:|---:|---:| +| 5.0s | all 200 | 0 | 1124ms | 3846ms | 31 KB | +| 2.0s | all 200 | 0 | 2634ms | 7626ms | 495 KB | +| 1.0s | all 200 | 0 | 1781ms | **19834ms** (one 4.5 MB doc) | 918 KB | +| 0.5s | all 200 | 0 | 666ms | 891ms | 748 KB | +| 0.25s | all 200 | 0 | 416ms | 1384ms | 251 KB | + +**Verdict**: 0.5s safe water mark. Latency variance is dominated by +**payload size** (Std docs span 4 KB to 4.5 MB) — not server backpressure. +The 19s p90 at the 1.0s tier was one giant doc, not a throttle. Same +posture as Pro API. ## modules.lceda.cn CDN — already at 0.2s @@ -80,7 +92,7 @@ back-to-back without throttling. No further probing needed. ```python SLEEP_BETWEEN = 1.0 # was 2.0 (oshwhub detail/listing) -SLEEP_SOURCE = 5.0 # unchanged (Std source — not yet probed) +SLEEP_SOURCE = 0.5 # was 5.0 (Std doc endpoint, 10× speedup) SLEEP_PRO = 0.5 # was 5.0 (Pro API host, 10× speedup) SLEEP_PRO_CDN = 0.2 # unchanged (CDN, already optimized) ``` @@ -88,5 +100,6 @@ SLEEP_PRO_CDN = 0.2 # unchanged (CDN, already optimized) ## Net impact on batch-50 plan - Pro 25 项 × ~5 API calls each: 5×5 = 25s/proj × 25 = ~10min → 0.5×5 = 2.5s/proj × 25 = ~1min +- Std 25 项 × ~10 doc calls each: 5×10 = 50s/proj × 25 = ~21min → 0.5×10 = 5s/proj × 25 = ~2min - Detail page scan 50 项: 50 × 2s = 100s → 50 × 1s = 50s -- Combined batch-50 walltime estimate: **~1.5h → ~30 min** +- Combined batch-50 walltime estimate: **~2h → ~10 min** (excluding actual download bytes) diff --git a/scripts/probe_rate_limit.py b/scripts/probe_rate_limit.py index 4b12e5c..b78c7d6 100644 --- a/scripts/probe_rate_limit.py +++ b/scripts/probe_rate_limit.py @@ -29,12 +29,14 @@ import httpx sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from crawlers.oshwhub.crawler import ( # noqa: E402 BROWSER_UA, + LCEDA_DOC_API, PRO_API, PRO_COOKIE_PATH_DEFAULT, PRO_EDITOR_VERSION, UA, make_client, make_pro_source_client, + make_source_client, ) @@ -79,6 +81,30 @@ def ladder_oshwhub_detail(reps: int = 10) -> None: break +def ladder_std_doc(reps: int = 10) -> None: + """lceda.cn/api/documents/ — anonymous Std doc fetch. + + Reads /tmp/std_doc_uuids.json (collected from already-crawled Std + projects). Std endpoints want browser UA + Referer (see + docs/sources/easyeda_std_source.md §3) — use the real source client. + """ + uuids = json.loads(Path("/tmp/std_doc_uuids.json").read_text()) + if len(uuids) < reps: + print(f"only {len(uuids)} Std doc UUIDs available", file=sys.stderr) + return + client = make_source_client() + sched = [5.0, 2.0, 1.0, 0.5, 0.25] + cursor = 0 + for sleep in sched: + # rotate so each tier hits distinct doc UUIDs + slot = uuids[cursor : cursor + reps] + cursor += reps + if len(slot) < reps: + slot = uuids[:reps] # fall back to repeats if pool exhausted + if not _run_std_tier(client, slot, sleep=sleep, tier=f"std-doc@{sleep}s"): + break + + def ladder_pro_api(reps: int = 8) -> None: """pro.lceda.cn/api/v4/projects/

— auth required. @@ -178,6 +204,40 @@ def _run_paths_tier( return True +def _run_std_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool: + print(f"\n=== {tier} ({len(uuids)} doc fetches at {sleep}s) ===") + statuses, sizes, lats = [], [], [] + bad = 0 + for i, u in enumerate(uuids): + url = f"{LCEDA_DOC_API}/{u}" + t0 = time.perf_counter() + try: + r = client.get(url, params={"uuid": u, "path": u}) + sz = len(r.content) + statuses.append(r.status_code); sizes.append(sz) + lats.append(time.perf_counter() - t0) + try: + ok = r.status_code == 200 and r.json().get("success", False) + except Exception: + ok = False + if not ok: + bad += 1 + print(f" [{i+1}] !! status={r.status_code} sz={sz} body[:200]={r.text[:200]!r}", + flush=True) + except Exception as e: # noqa: BLE001 + bad += 1 + statuses.append(-1) + print(f" [{i+1}] EXC {type(e).__name__}: {e}", flush=True) + if i + 1 < len(uuids): + time.sleep(sleep) + _summary(statuses, sizes, lats, bad) + if bad: + return False + if sleep > 0: + print(f" recovery sleep 30s before next tier..."); time.sleep(30) + return True + + def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool: print(f"\n=== {tier} ({len(uuids)} project meta calls at {sleep}s) ===") statuses, sizes, latencies = [], [], [] @@ -231,7 +291,7 @@ def _summary(statuses, sizes, latencies, bad) -> None: def main() -> int: ap = argparse.ArgumentParser() - ap.add_argument("--host", choices=["oshwhub", "detail", "pro"], required=True) + ap.add_argument("--host", choices=["oshwhub", "detail", "pro", "std-doc"], required=True) ap.add_argument("--reps", type=int, default=10) args = ap.parse_args() @@ -241,6 +301,8 @@ def main() -> int: ladder_oshwhub_detail(reps=args.reps) elif args.host == "pro": ladder_pro_api(reps=min(args.reps, 8)) # cap pro reps + elif args.host == "std-doc": + ladder_std_doc(reps=args.reps) return 0