"""Dump full oshwhub listing index for both origins to a local jsonl. 只抓 listing API,不抓详情页、不抓附件、不抓工程源。结果落 `data/state/oshwhub_listing_full.jsonl`,每行一条 listing 项。 Usage: uv run python scripts/dump_listing_index.py uv run python scripts/dump_listing_index.py --page-size 500 --sleep 0.5 API 注意: - `sort` 参数被服务端静默忽略,无论传啥都返回隐式排序(grade desc → 质量 desc) - 默认 `origin` 是 std;要 Pro 必须显式 `origin=pro` - `pageSize` 实测 ≥1000 都接受,单次响应体 ~1 MB """ from __future__ import annotations import argparse import json import math import statistics as st import sys import time from collections import Counter from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from crawlers.oshwhub.crawler import list_projects, make_client # noqa: E402 ORIGINS = ("pro", "std") DEFAULT_OUT = Path("data/state/oshwhub_listing_full.jsonl") def dump_origin(client, origin: str, page_size: int, sleep: float, sink) -> int: # First call to learn `total` / `totalPage`. res = list_projects(client, page=1, page_size=page_size, origin=origin) total = res["total"] n_pages = math.ceil(total / page_size) written = 0 for it in res["lists"]: sink.write(json.dumps(it, ensure_ascii=False) + "\n") written += 1 print( f"[{origin}] total={total} pages={n_pages} pageSize={page_size}", flush=True, ) print(f" page 1/{n_pages}: {len(res['lists'])} items", flush=True) for p in range(2, n_pages + 1): time.sleep(sleep) res = list_projects(client, page=p, page_size=page_size, origin=origin) for it in res["lists"]: sink.write(json.dumps(it, ensure_ascii=False) + "\n") written += 1 print(f" page {p:>2}/{n_pages}: {len(res['lists'])} items", flush=True) if written != total: print( f" WARN: wrote {written} but server said total={total} " f"(diff={total - written})", file=sys.stderr, ) return written def summarize(path: Path) -> None: by_origin: dict[str, list[dict]] = {} with path.open() as f: for ln in f: it = json.loads(ln) by_origin.setdefault(it.get("origin") or "?", []).append(it) print("\n===== summary =====") print(f"file: {path} size={path.stat().st_size / 1024 / 1024:.1f} MB") for origin, items in sorted(by_origin.items()): if not items: continue likes = sorted(((i.get("count") or {}).get("like") or 0) for i in items) views = sorted(((i.get("count") or {}).get("views") or 0) for i in items) grades = Counter(i.get("grade") for i in items) # date range from created_at dates = sorted(i.get("created_at") or "" for i in items if i.get("created_at")) date_min = dates[0][:10] if dates else "?" date_max = dates[-1][:10] if dates else "?" print(f"\n[{origin}] n={len(items)}") print( f" likes: median={likes[len(likes) // 2]} " f"p90={likes[int(len(likes) * 0.9)]} " f"p99={likes[int(len(likes) * 0.99)]} max={likes[-1]}" ) print( f" views: median={views[len(views) // 2]} " f"p90={views[int(len(views) * 0.9)]} max={views[-1]}" ) print(f" grade: {dict(sorted(grades.items(), key=lambda kv: -(kv[0] or 0)))}") print(f" dates: {date_min} .. {date_max}") # Quality tiers (handy for batch sizing) tier_grade3 = sum(1 for i in items if (i.get("grade") or 0) >= 3) tier_like10 = sum( 1 for i in items if ((i.get("count") or {}).get("like") or 0) >= 10 and (i.get("grade") or 0) >= 3 ) print(f" tier: grade>=3: {tier_grade3} grade>=3 & like>=10: {tier_like10}") def main(argv: list[str] | None = None) -> int: ap = argparse.ArgumentParser() ap.add_argument("--out", type=Path, default=DEFAULT_OUT) ap.add_argument("--page-size", type=int, default=1000) ap.add_argument("--sleep", type=float, default=1.0, help="seconds between page calls") args = ap.parse_args(argv) args.out.parent.mkdir(parents=True, exist_ok=True) print(f"writing -> {args.out}") t0 = time.time() with args.out.open("w") as sink, make_client() as client: total = 0 for origin in ORIGINS: total += dump_origin(client, origin, args.page_size, args.sleep, sink) sink.flush() print(f"\ndone: {total} items in {time.time() - t0:.1f}s") summarize(args.out) return 0 if __name__ == "__main__": raise SystemExit(main())