oshwhub: dump full listing index (33,695 projects) for batch sizing
Probed listing API and learned: total field is exposed (Pro=21,202 / Std=12,493), pageSize accepts >=1000 (full corpus = 35 requests / 71s), sort param is silently ignored. Dump all listings via scripts/dump_listing_index.py to local jsonl so downstream batch-selection no longer hits the API. Why: needed quantitative anchors before scaling Pro batch beyond top-5. License is detail-page only (~19h serial scan), so we want to filter on grade/like *locally* first to shortlist before paying that cost. Quality-tier counts now known: A-tier (grade>=3 & like>=10) = 2,806 across both origins. - scripts/dump_listing_index.py: one-shot scraper, polite QPS, streams to jsonl - docs/sources/oshwhub_listing_full.md: human-readable report with growth trends, quality tiers, owner concentration, and storage-budget anchors Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
126
scripts/dump_listing_index.py
Normal file
126
scripts/dump_listing_index.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Dump full oshwhub listing index for both origins to a local jsonl.
|
||||
|
||||
只抓 listing API,不抓详情页、不抓附件、不抓工程源。结果落
|
||||
`data/state/oshwhub_listing_full.jsonl`,每行一条 listing 项。
|
||||
|
||||
Usage:
|
||||
uv run python scripts/dump_listing_index.py
|
||||
uv run python scripts/dump_listing_index.py --page-size 500 --sleep 0.5
|
||||
|
||||
API 注意:
|
||||
- `sort` 参数被服务端静默忽略,无论传啥都返回隐式排序(grade desc → 质量 desc)
|
||||
- 默认 `origin` 是 std;要 Pro 必须显式 `origin=pro`
|
||||
- `pageSize` 实测 ≥1000 都接受,单次响应体 ~1 MB
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import statistics as st
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from crawlers.oshwhub.crawler import list_projects, make_client # noqa: E402
|
||||
|
||||
# Origins dumped by main(), in this order. Both are listed explicitly because
# the listing API falls back to "std" when no origin is given.
ORIGINS = ("pro", "std")
|
||||
# Default value of the --out option: the jsonl file that receives the dump.
DEFAULT_OUT = Path("data/state/oshwhub_listing_full.jsonl")
|
||||
|
||||
|
||||
def _write_items(items, sink) -> int:
    """Write each item to ``sink`` as one JSON line and return the item count."""
    count = 0
    for item in items:
        # ensure_ascii=False keeps non-ASCII text readable in the jsonl.
        sink.write(json.dumps(item, ensure_ascii=False) + "\n")
        count += 1
    return count


def dump_origin(client, origin: str, page_size: int, sleep: float, sink) -> int:
    """Stream every listing item of one origin into ``sink`` as JSON lines.

    Page 1 is fetched first to learn the server-reported ``total``; the page
    count is derived from it and the remaining pages are fetched one by one,
    pausing ``sleep`` seconds before each request.

    Args:
        client: Client object forwarded unchanged to ``list_projects``.
        origin: Origin value passed to the listing API (e.g. ``"pro"``).
        page_size: Number of items requested per page; must be >= 1.
        sleep: Seconds to wait before every request after the first one.
        sink: Writable text stream that receives one JSON document per line.

    Returns:
        The number of items written to ``sink``.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    # Reject bad sizes before touching the network: 0 would otherwise raise
    # ZeroDivisionError after the first request, and a negative size would
    # yield a non-positive page count and silently dump page 1 only.
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size}")

    # First call to learn `total`; the page count is computed locally.
    res = list_projects(client, page=1, page_size=page_size, origin=origin)
    total = res["total"]
    n_pages = math.ceil(total / page_size)
    n_items = _write_items(res["lists"], sink)
    written = n_items
    print(
        f"[{origin}] total={total} pages={n_pages} pageSize={page_size}",
        flush=True,
    )
    print(f" page 1/{n_pages}: {n_items} items", flush=True)

    for p in range(2, n_pages + 1):
        time.sleep(sleep)  # stay polite between consecutive page requests
        res = list_projects(client, page=p, page_size=page_size, origin=origin)
        n_items = _write_items(res["lists"], sink)
        written += n_items
        print(f" page {p:>2}/{n_pages}: {n_items} items", flush=True)

    # A mismatch is reported on stderr but is not treated as an error.
    if written != total:
        print(
            f" WARN: wrote {written} but server said total={total} "
            f"(diff={total - written})",
            file=sys.stderr,
        )
    return written
|
||||
|
||||
|
||||
def _count(item: dict, key: str):
    """Return ``item["count"][key]``, treating a missing or null value as 0."""
    return (item.get("count") or {}).get(key) or 0


def summarize(path: Path) -> None:
    """Print per-origin statistics for a jsonl listing dump.

    The file is read as UTF-8 (the encoding ``main`` writes) with one JSON
    object per line; blank lines are skipped. Items are grouped by their
    ``origin`` field (``"?"`` when missing or empty) and, for each group, the
    like/view distribution, grade histogram, ``created_at`` range and two
    quality-tier counts are printed to stdout.

    Args:
        path: Location of the jsonl file to summarize.
    """
    by_origin: dict[str, list[dict]] = {}
    with path.open(encoding="utf-8") as f:
        for ln in f:
            # Tolerate blank lines (e.g. a trailing newline added by hand).
            if not ln.strip():
                continue
            it = json.loads(ln)
            by_origin.setdefault(it.get("origin") or "?", []).append(it)

    print("\n===== summary =====")
    print(f"file: {path} size={path.stat().st_size / 1024 / 1024:.1f} MB")

    # Every group holds at least one item, so the index lookups below are safe.
    for origin, items in sorted(by_origin.items()):
        likes = sorted(_count(i, "like") for i in items)
        views = sorted(_count(i, "views") for i in items)
        grades = Counter(i.get("grade") for i in items)
        # Date range from created_at; values are compared as plain strings.
        # NOTE(review): presumably ISO-like timestamps, so string order is
        # chronological order — confirm against the API.
        dates = sorted(i["created_at"] for i in items if i.get("created_at"))
        date_min = dates[0][:10] if dates else "?"
        date_max = dates[-1][:10] if dates else "?"

        print(f"\n[{origin}] n={len(items)}")
        print(
            f" likes: median={likes[len(likes) // 2]} "
            f"p90={likes[int(len(likes) * 0.9)]} "
            f"p99={likes[int(len(likes) * 0.99)]} max={likes[-1]}"
        )
        print(
            f" views: median={views[len(views) // 2]} "
            f"p90={views[int(len(views) * 0.9)]} max={views[-1]}"
        )
        # Highest grade first; a null grade sorts as 0.
        print(f" grade: {dict(sorted(grades.items(), key=lambda kv: -(kv[0] or 0)))}")
        print(f" dates: {date_min} .. {date_max}")

        # Quality tiers (handy for batch sizing)
        tier_grade3 = sum(1 for i in items if (i.get("grade") or 0) >= 3)
        tier_like10 = sum(
            1
            for i in items
            if _count(i, "like") >= 10 and (i.get("grade") or 0) >= 3
        )
        print(f" tier: grade>=3: {tier_grade3} grade>=3 & like>=10: {tier_like10}")


def main(argv: list[str] | None = None) -> int:
    """Dump every origin in ``ORIGINS`` to a jsonl file and print a summary.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.
            Options are ``--out``, ``--page-size`` and ``--sleep``.

    Returns:
        Process exit code, always 0 when the dump completes.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=Path, default=DEFAULT_OUT)
    ap.add_argument("--page-size", type=int, default=1000)
    ap.add_argument("--sleep", type=float, default=1.0, help="seconds between page calls")
    args = ap.parse_args(argv)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    print(f"writing -> {args.out}")
    t0 = time.time()

    # Write to a sibling temp file and swap it in only after every origin
    # succeeded, so a failed run cannot truncate a previous good dump.
    tmp_out = args.out.with_name(args.out.name + ".tmp")
    # Explicit UTF-8: items are serialized with ensure_ascii=False, which a
    # non-UTF-8 locale default could fail to encode.
    with tmp_out.open("w", encoding="utf-8") as sink, make_client() as client:
        total = 0
        for origin in ORIGINS:
            total += dump_origin(client, origin, args.page_size, args.sleep, sink)
            sink.flush()
    tmp_out.replace(args.out)

    print(f"\ndone: {total} items in {time.time() - t0:.1f}s")
    summarize(args.out)
    return 0
|
||||
|
||||
|
||||
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user