# FacereDataset/scripts/probe_rate_limit.py

"""Rate-limit ladder probe — find each host's actual ceiling.

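Send requests to the target endpoint at progressively shorter intervals,
monitoring status codes / body size / exceptions.
If any tier shows a 429 / 403 / 5xx / abnormal connection close → stop at that
tier and treat the previous tier as the safe water-mark.

Design principles
- No conclusions from a single sample: each tier needs at least 8-10 requests
  before a judgement is made
- A 30s recovery period is inserted between tiers so that throttling triggered
  by the previous tier does not contaminate the next one
- Read-only endpoints (GET) only; nothing is modified
- The Pro API probe uses UUIDs from the candidate list that we were going to
  hit anyway, so no fingerprint is wasted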

Usage:
    uv run python scripts/probe_rate_limit.py --host oshwhub
    uv run python scripts/probe_rate_limit.py --host detail
    uv run python scripts/probe_rate_limit.py --host std-doc
    uv run python scripts/probe_rate_limit.py --host pro     # cookie required
"""

from __future__ import annotations

import argparse
import json
import statistics
import sys
import time
from pathlib import Path

import httpx

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import (  # noqa: E402
    BROWSER_UA,
    LCEDA_DOC_API,
    PRO_API,
    PRO_COOKIE_PATH_DEFAULT,
    PRO_EDITOR_VERSION,
    UA,
    make_client,
    make_pro_source_client,
    make_source_client,
)


def ladder_oshwhub_listing(reps: int = 10) -> None:
    """Probe the unauthenticated listing API at oshwhub.com/api/project.

    Walks the interval ladder from slowest (2.0 s) to fastest (0.0 s),
    requesting the same listing page ``reps`` times per tier, and stops at
    the first tier that ``_run_one_tier`` reports as failed.
    """
    listing_url = "https://oshwhub.com/api/project"
    query = {"page": 1, "pageSize": 30, "origin": "pro"}
    http = make_client()
    for interval in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
        tier_ok = _run_one_tier(
            http,
            "GET",
            listing_url,
            params=query,
            sleep=interval,
            reps=reps,
            tier_name=f"listing@{interval}s",
        )
        if not tier_ok:
            # A failed tier ends the ladder; nothing faster is attempted.
            return


def ladder_oshwhub_detail(reps: int = 10) -> None:
    """Probe detail HTML pages at oshwhub.com/<path>.

    Reads candidate records from
    ``data/state/oshwhub_batch50_candidates.jsonl`` (one JSON object per
    line, each providing a ``path`` key) so the probe exercises real targets.
    The first ``reps`` candidate paths are fetched once per tier; the same
    paths are reused in every tier.

    The ladder stops at the first tier that ``_run_paths_tier`` reports as
    failed. If the candidate file yields no paths, nothing is requested —
    an empty tier would otherwise "pass" vacuously without sending anything.
    """
    candidates_file = Path("data/state/oshwhub_batch50_candidates.jsonl")
    # Context manager closes the handle; blank lines (e.g. a trailing
    # newline at end of file) are skipped instead of crashing json.loads.
    with candidates_file.open(encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    # The slice does not depend on the tier, so build it once.
    paths = [c["path"] for c in candidates[:reps]]
    if not paths:
        print(
            f"no candidate paths to probe (reps={reps}, file={candidates_file})",
            file=sys.stderr,
        )
        return
    if len(paths) < reps:
        print(
            f"only {len(paths)} candidate paths available (need {reps}); "
            "probing with fewer",
            file=sys.stderr,
        )

    client = make_client()
    # Start polite, ramp aggressive
    for sleep in (2.0, 1.0, 0.5, 0.25, 0.1, 0.0):
        if not _run_paths_tier(
            client,
            paths,
            sleep=sleep,
            tier_name=f"detail@{sleep}s",
        ):
            break


def ladder_std_doc(reps: int = 10) -> None:
    """Probe the anonymous Std document endpoint (lceda.cn/api/documents/<doc_uuid>).

    Document UUIDs are loaded from ``/tmp/std_doc_uuids.json``. Requests go
    through ``make_source_client()``; per docs/sources/easyeda_std_source.md §3
    the Std endpoints want a browser UA + Referer.

    Each tier takes the next ``reps`` UUIDs from the pool so tiers hit
    distinct documents; once the pool runs out, the first ``reps`` UUIDs are
    reused. If the pool holds fewer than ``reps`` UUIDs, a message is
    written to stderr and nothing is probed.
    """
    pool = json.loads(Path("/tmp/std_doc_uuids.json").read_text())
    available = len(pool)
    if available < reps:
        print(f"only {available} Std doc UUIDs available", file=sys.stderr)
        return

    http = make_source_client()
    for tier_no, interval in enumerate((5.0, 2.0, 1.0, 0.5, 0.25)):
        # Tier k reads the k-th window of `reps` UUIDs.
        start = tier_no * reps
        batch = pool[start : start + reps]
        if len(batch) < reps:
            # Pool exhausted: fall back to repeating the first window.
            batch = pool[:reps]
        tier_ok = _run_std_tier(
            http, batch, sleep=interval, tier=f"std-doc@{interval}s"
        )
        if not tier_ok:
            break


def ladder_pro_api(reps: int = 8) -> None:
    """Probe the authenticated Pro project-meta endpoint (``{PRO_API}/projects/<uuid>``).

    Reads candidate records from
    ``data/state/oshwhub_batch50_candidates.jsonl`` (one JSON object per
    line) and keeps the ``uuid`` of every record whose ``origin`` is
    ``"pro"``. The first ``reps`` of those UUIDs are requested in every tier
    through ``make_pro_source_client()``.

    ``reps`` defaults lower than the other ladders because this host is
    treated as the most precious one (account ban risk). The ladder is
    conservative (5 s down to 0.25 s) and stops at the first tier that
    ``_run_pro_tier`` reports as failed. If fewer than ``reps`` Pro UUIDs are
    available, a message is written to stderr and nothing is probed.
    """
    candidates_file = Path("data/state/oshwhub_batch50_candidates.jsonl")
    # Context manager closes the handle; blank lines (e.g. a trailing
    # newline at end of file) are skipped instead of crashing json.loads.
    with candidates_file.open(encoding="utf-8") as fh:
        candidates = [json.loads(ln) for ln in fh if ln.strip()]

    pro_uuids = [c["uuid"] for c in candidates if c.get("origin") == "pro"]
    if len(pro_uuids) < reps:
        print(f"only {len(pro_uuids)} Pro UUIDs available (need {reps})", file=sys.stderr)
        return

    client = make_pro_source_client()
    batch = pro_uuids[:reps]
    last_safe: float | None = None
    # Conservative ladder for Pro: start 5s, halve down, stop early on trouble
    for sleep in (5.0, 2.0, 1.0, 0.5, 0.25):
        if not _run_pro_tier(client, batch, sleep=sleep, tier=f"pro@{sleep}s"):
            if last_safe is None:
                # The slowest tier already failed, so there is no previous
                # tier to recommend as a safe interval.
                print(f"\n  STOP at {sleep}s — no tier passed; no safe water-mark established.")
            else:
                print(f"\n  STOP at {sleep}s — previous tier is safe water-mark.")
            break
        last_safe = sleep


def _run_one_tier(
    client: httpx.Client,
    method: str,
    url: str,
    *,
    sleep: float,
    reps: int,
    tier_name: str,
    params: dict | None = None,
) -> bool:
    """Send the same request ``reps`` times, pausing ``sleep`` seconds between sends.

    A request counts as bad when it raises, returns a non-200 status, or
    returns an empty body. Exceptions are recorded with the pseudo status
    ``-1``. A per-tier digest is printed via ``_summary``.

    Returns:
        ``False`` if any request was bad (the caller should stop the ladder);
        otherwise ``True``, after a 30 s recovery sleep when ``sleep > 0``.
    """
    print(f"\n=== {tier_name}  ({reps} reqs at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for attempt in range(1, reps + 1):
        started = time.perf_counter()
        try:
            resp = client.request(method, url, params=params)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            if resp.status_code != 200 or nbytes <= 0:
                failures += 1
                print(
                    f"  [{attempt}] !! status={resp.status_code} sz={nbytes}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f"  [{attempt}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final request of the tier.
        if attempt < reps:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        print(f"  -> tier FAILED ({failures}/{reps} bad). Stopping ladder.")
        return False
    if sleep > 0:
        print("  recovery sleep 30s before next tier...")
        time.sleep(30)
    return True


def _run_paths_tier(
    client: httpx.Client, paths: list[str], *, sleep: float, tier_name: str
) -> bool:
    """Fetch each detail page in ``paths`` once, pausing ``sleep`` seconds between fetches.

    Every path is requested as ``https://oshwhub.com/<path>``. A fetch counts
    as bad when it raises, returns a non-200 status, or returns a body of
    5000 bytes or fewer. Exceptions are recorded with the pseudo status
    ``-1``. A per-tier digest is printed via ``_summary``.

    Returns:
        ``False`` if any fetch was bad (the caller should stop the ladder);
        otherwise ``True``, after a 30 s recovery sleep when ``sleep > 0``.
    """
    total = len(paths)
    print(f"\n=== {tier_name}  ({total} pages at {sleep}s interval) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for seq, rel_path in enumerate(paths, start=1):
        page_url = f"https://oshwhub.com/{rel_path}"
        started = time.perf_counter()
        try:
            resp = client.get(page_url)
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            # detail pages should be sizable
            if resp.status_code != 200 or nbytes <= 5000:
                failures += 1
                print(
                    f"  [{seq}] !! status={resp.status_code} sz={nbytes} url={page_url[:80]}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f"  [{seq}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final page of the tier.
        if seq < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        print("  -> tier FAILED. Stopping ladder.")
        return False
    if sleep > 0:
        print("  recovery sleep 30s before next tier...")
        time.sleep(30)
    return True


def _run_std_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Fetch each Std document in ``uuids`` once, pausing ``sleep`` seconds between fetches.

    Each document is requested as ``{LCEDA_DOC_API}/<uuid>`` with the UUID
    also passed as the ``uuid`` and ``path`` query parameters. A fetch counts
    as bad when it raises, returns a non-200 status, or its JSON body is
    unparsable or lacks a truthy ``success`` field. Exceptions are recorded
    with the pseudo status ``-1``. A per-tier digest is printed via
    ``_summary``.

    Returns:
        ``False`` if any fetch was bad (the caller should stop the ladder);
        otherwise ``True``, after a 30 s recovery sleep when ``sleep > 0``.
    """
    print(f"\n=== {tier}  ({len(uuids)} doc fetches at {sleep}s) ===")
    statuses: list[int] = []
    sizes: list[int] = []
    lats: list[float] = []
    bad = 0
    for i, u in enumerate(uuids):
        url = f"{LCEDA_DOC_API}/{u}"
        t0 = time.perf_counter()
        try:
            r = client.get(url, params={"uuid": u, "path": u})
            sz = len(r.content)
            statuses.append(r.status_code)
            sizes.append(sz)
            lats.append(time.perf_counter() - t0)
            try:
                ok = r.status_code == 200 and r.json().get("success", False)
            except Exception:
                # Non-JSON or non-object body: treat as a failed fetch
                # rather than letting the parse error escape.
                ok = False
            if not ok:
                bad += 1
                print(f"  [{i+1}] !! status={r.status_code} sz={sz} body[:200]={r.text[:200]!r}",
                      flush=True)
        except Exception as e:  # noqa: BLE001
            bad += 1
            statuses.append(-1)
            print(f"  [{i+1}] EXC {type(e).__name__}: {e}", flush=True)
        # No pause after the final fetch of the tier.
        if i + 1 < len(uuids):
            time.sleep(sleep)
    _summary(statuses, sizes, lats, bad)
    if bad:
        # Say explicitly why the ladder ends, matching the listing/detail
        # tiers; the Std ladder itself prints nothing on failure.
        print(f"  -> tier FAILED ({bad}/{len(uuids)} bad). Stopping ladder.")
        return False
    if sleep > 0:
        print("  recovery sleep 30s before next tier...")
        time.sleep(30)
    return True


def _run_pro_tier(client: httpx.Client, uuids: list[str], *, sleep: float, tier: str) -> bool:
    """Fetch project metadata for each UUID once, pausing ``sleep`` seconds between calls.

    Each project is requested as ``{PRO_API}/projects/<uuid>`` with the UUID
    also sent in a ``path`` request header. A call counts as bad when it
    raises, returns a non-200 status, or its JSON body is unparsable or
    lacks a truthy ``success`` field. Exceptions are recorded with the
    pseudo status ``-1``. A per-tier digest is printed via ``_summary``.

    Returns:
        ``False`` if any call was bad (the caller should stop the ladder);
        otherwise ``True``, after a 30 s recovery sleep when ``sleep > 0``.
    """
    total = len(uuids)
    print(f"\n=== {tier}  ({total} project meta calls at {sleep}s) ===")
    codes: list[int] = []
    body_sizes: list[int] = []
    elapsed: list[float] = []
    failures = 0

    for seq, project_uuid in enumerate(uuids, start=1):
        endpoint = f"{PRO_API}/projects/{project_uuid}"
        started = time.perf_counter()
        try:
            resp = client.get(endpoint, headers={"path": project_uuid})
            nbytes = len(resp.content)
            codes.append(resp.status_code)
            body_sizes.append(nbytes)
            elapsed.append(time.perf_counter() - started)
            try:
                payload = resp.json()
                healthy = resp.status_code == 200 and payload.get("success", False)
            except Exception:
                # Non-JSON or non-object body counts as a failed call.
                healthy = False
            if not healthy:
                failures += 1
                print(
                    f"  [{seq}] !! status={resp.status_code} sz={nbytes} body[:200]={resp.text[:200]!r}",
                    flush=True,
                )
        except Exception as exc:  # noqa: BLE001
            failures += 1
            codes.append(-1)
            print(f"  [{seq}] EXC {type(exc).__name__}: {exc}", flush=True)
        # No pause after the final call of the tier.
        if seq < total:
            time.sleep(sleep)

    _summary(codes, body_sizes, elapsed, failures)
    if failures:
        return False
    if sleep > 0:
        print("  recovery sleep 30s before next tier...")
        time.sleep(30)
    return True


def _summary(statuses, sizes, latencies, bad) -> None:
    if not statuses:
        return
    by_code: dict[int, int] = {}
    for s in statuses:
        by_code[s] = by_code.get(s, 0) + 1
    if latencies:
        med = statistics.median(latencies)
        p90 = sorted(latencies)[int(len(latencies) * 0.9)]
        print(f"  status: {by_code}  bad={bad}  latency med={med * 1000:.0f}ms p90={p90 * 1000:.0f}ms")
    else:
        print(f"  status: {by_code}  bad={bad}")
    if sizes:
        print(f"  size:   median={statistics.median(sizes)} min={min(sizes)} max={max(sizes)}")


def main() -> int:
    """Parse ``--host`` / ``--reps`` and run the matching ladder.

    Returns:
        Always ``0``; the ladders report their findings on stdout/stderr.
    """
    # Lambdas defer the lookup of each ladder until it is actually chosen.
    runners = {
        "oshwhub": lambda n: ladder_oshwhub_listing(reps=n),
        "detail": lambda n: ladder_oshwhub_detail(reps=n),
        "pro": lambda n: ladder_pro_api(reps=min(n, 8)),  # cap pro reps
        "std-doc": lambda n: ladder_std_doc(reps=n),
    }

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", choices=list(runners), required=True)
    parser.add_argument("--reps", type=int, default=10)
    opts = parser.parse_args()

    runners[opts.host](opts.reps)
    return 0


# Script entry point: exit with main()'s return value as the process status.
if __name__ == "__main__":
    sys.exit(main())