Files
FacereDataset/scripts/dump_listing_index.py
Knowit d89a7cdf9c oshwhub: dump full listing index (33,695 projects) for batch sizing
Probed listing API and learned: total field is exposed (Pro=21,202 / Std=12,493),
pageSize accepts >=1000 (full corpus = 35 requests / 71s), sort param is silently
ignored. Dump all listings via scripts/dump_listing_index.py to local jsonl so
downstream batch-selection no longer hits the API.

Why: needed quantitative anchors before scaling Pro batch beyond top-5. License
is detail-page only (~19h serial scan), so we want to filter on grade/like
*locally* first to shortlist before paying that cost. Quality-tier counts now
known: A-tier (grade>=3 & like>=10) = 2,806 across both origins.

- scripts/dump_listing_index.py: one-shot scraper, polite QPS, streams to jsonl
- docs/sources/oshwhub_listing_full.md: human-readable report with growth
  trends, quality tiers, owner concentration, and storage-budget anchors

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:30:56 +08:00

127 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
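"""Dump full oshwhub listing index for both origins to a local jsonl.

Only the listing API is fetched: no detail pages, no attachments, no project
sources. Results are written to `data/state/oshwhub_listing_full.jsonl`, one
listing item per line.

Usage:
    uv run python scripts/dump_listing_index.py
    uv run python scripts/dump_listing_index.py --page-size 500 --sleep 0.5

API notes:
- The `sort` parameter is silently ignored by the server; whatever is passed,
  results come back in an implicit order (grade desc, then quality desc).
- The default `origin` is std; Pro requires an explicit `origin=pro`.
- `pageSize` values >= 1000 were accepted in testing; a single response body
  is ~1 MB.
"""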
from __future__ import annotations
import argparse
import json
import math
import statistics as st
import sys
import time
from collections import Counter
from pathlib import Path
# Put the directory two levels above this file (the parent of the directory
# containing this script; presumably the repository root) at the front of
# sys.path so that the `crawlers` package import below resolves when the
# script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import list_projects, make_client # noqa: E402
# Origins dumped by main(), in this order.
ORIGINS = ("pro", "std")
# Default value of --out; a relative path, so it is resolved against the
# current working directory.
DEFAULT_OUT = Path("data/state/oshwhub_listing_full.jsonl")
def dump_origin(client, origin: str, page_size: int, sleep: float, sink) -> int:
    """Stream every listing item of one origin into ``sink`` as JSON lines.

    Page 1 is fetched first to read the server-reported ``total``; the page
    count is derived from that total and the requested ``page_size``. Every
    further page is fetched after pausing ``sleep`` seconds. Progress goes to
    stdout; a mismatch between items written and ``total`` is reported on
    stderr without raising.

    Args:
        client: Client object passed through to ``list_projects``.
        origin: Origin name forwarded to ``list_projects``.
        page_size: Number of items requested per page.
        sleep: Seconds to wait before each page after the first.
        sink: Writable text stream; one JSON document plus newline per item.

    Returns:
        The number of items written to ``sink``.
    """

    def _write_page(batch) -> int:
        # Serialize each item on its own line and report how many were written.
        count = 0
        for entry in batch:
            sink.write(json.dumps(entry, ensure_ascii=False) + "\n")
            count += 1
        return count

    # The first request doubles as the probe for `total`.
    first = list_projects(client, page=1, page_size=page_size, origin=origin)
    total = first["total"]
    n_pages = math.ceil(total / page_size)
    written = _write_page(first["lists"])
    print(f"[{origin}] total={total} pages={n_pages} pageSize={page_size}", flush=True)
    print(f" page 1/{n_pages}: {len(first['lists'])} items", flush=True)

    page_no = 2
    while page_no <= n_pages:
        # Pause between requests to keep the request rate polite.
        time.sleep(sleep)
        page = list_projects(client, page=page_no, page_size=page_size, origin=origin)
        written += _write_page(page["lists"])
        print(f" page {page_no:>2}/{n_pages}: {len(page['lists'])} items", flush=True)
        page_no += 1

    if written == total:
        return written

    # Only warn: the partial dump is still useful to the caller.
    print(
        f" WARN: wrote {written} but server said total={total} "
        f"(diff={total - written})",
        file=sys.stderr,
    )
    return written
def summarize(path: Path) -> None:
by_origin: dict[str, list[dict]] = {}
with path.open() as f:
for ln in f:
it = json.loads(ln)
by_origin.setdefault(it.get("origin") or "?", []).append(it)
print("\n===== summary =====")
print(f"file: {path} size={path.stat().st_size / 1024 / 1024:.1f} MB")
for origin, items in sorted(by_origin.items()):
if not items:
continue
likes = sorted(((i.get("count") or {}).get("like") or 0) for i in items)
views = sorted(((i.get("count") or {}).get("views") or 0) for i in items)
grades = Counter(i.get("grade") for i in items)
# date range from created_at
dates = sorted(i.get("created_at") or "" for i in items if i.get("created_at"))
date_min = dates[0][:10] if dates else "?"
date_max = dates[-1][:10] if dates else "?"
print(f"\n[{origin}] n={len(items)}")
print(
f" likes: median={likes[len(likes) // 2]} "
f"p90={likes[int(len(likes) * 0.9)]} "
f"p99={likes[int(len(likes) * 0.99)]} max={likes[-1]}"
)
print(
f" views: median={views[len(views) // 2]} "
f"p90={views[int(len(views) * 0.9)]} max={views[-1]}"
)
print(f" grade: {dict(sorted(grades.items(), key=lambda kv: -(kv[0] or 0)))}")
print(f" dates: {date_min} .. {date_max}")
# Quality tiers (handy for batch sizing)
tier_grade3 = sum(1 for i in items if (i.get("grade") or 0) >= 3)
tier_like10 = sum(
1
for i in items
if ((i.get("count") or {}).get("like") or 0) >= 10
and (i.get("grade") or 0) >= 3
)
print(f" tier: grade>=3: {tier_grade3} grade>=3 & like>=10: {tier_like10}")
def main(argv: list[str] | None = None) -> int:
ap = argparse.ArgumentParser()
ap.add_argument("--out", type=Path, default=DEFAULT_OUT)
ap.add_argument("--page-size", type=int, default=1000)
ap.add_argument("--sleep", type=float, default=1.0, help="seconds between page calls")
args = ap.parse_args(argv)
args.out.parent.mkdir(parents=True, exist_ok=True)
print(f"writing -> {args.out}")
t0 = time.time()
with args.out.open("w") as sink, make_client() as client:
total = 0
for origin in ORIGINS:
total += dump_origin(client, origin, args.page_size, args.sleep, sink)
sink.flush()
print(f"\ndone: {total} items in {time.time() - t0:.1f}s")
summarize(args.out)
return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())