Add corpus size/license estimator; snapshot 90-project findings

Why:
- 放量决策需要比"52MB/项目 × 12493 = 650GB"更扎实的数据。用 scripts/estimate_size.py 采样 90 个 hot 项目的 attachments[].size 得到真实分布（median 9MB / p90 54MB），全量 median 估算 110GB，p90 上界 660GB。这给 Charles 一个可信的存储预算。
- 附带 license 和 ext 分布采出两个重要洞察：
  (1) mp4+qt 视频占 54% 存储，加 --skip-ext 开关可节省一半；
  (2) NC (Non-Commercial) 许可 ~11%，下游必须按 whitelist 过滤。

What:
- scripts/estimate_size.py: 无下载的元数据采样器，复用 crawler.parse_detail_html
- docs/sources/oshwhub_corpus_estimate.md: 结果快照 + 决策建议
- log.md: 本次会话记录

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 19:45:54 +08:00
parent c8d55a22eb
commit e222b08f27
3 changed files with 213 additions and 1 deletions
--- a/scripts/estimate_size.py
+++ b/scripts/estimate_size.py
@@ -0,0 +1,123 @@
+"""Estimate full-corpus storage by sampling oshwhub detail pages (no downloads).
+
+从列表 API 取 N 个项目，解析每个详情页的 `attachments[]`，把 `size` 字段求和。
+不下载任何附件，仅抓 HTML 页，对服务器压力小；可用来快速给 Charles 一个
+放量存储估计。
+
+Usage:
+    uv run python scripts/estimate_size.py --pages 5 --sort hot
+"""
+
+from __future__ import annotations
+
+import argparse
+import statistics as st
+import sys
+import time
+from pathlib import Path
+
+# Reuse crawler helpers; avoid duplicating HTTP/parse code
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from crawlers.oshwhub.crawler import (  # noqa: E402
+    make_client,
+    list_projects,
+    parse_detail_html,
+    BASE,
+)
+
+
+def fmt_mb(bytes_: float) -> str:
+    """Format a byte count as mebibytes with one decimal, e.g. ``"1.5 MB"``."""
+    mebibytes = bytes_ / (1024 * 1024)
+    return f"{mebibytes:.1f} MB"
+
+
+def fmt_gb(bytes_: float) -> str:
+    """Format a byte count as gibibytes with two decimals, e.g. ``"1.50 GB"``."""
+    gibibytes = bytes_ / (1024 ** 3)
+    return f"{gibibytes:.2f} GB"
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Sample project detail pages and print storage/license estimates.
+
+    Walks up to ``--pages`` pages of the project list, fetches each listed
+    project's detail page, and sums the ``size`` of every entry in its
+    ``attachments`` list. Prints one line per sampled project, then summary
+    statistics, a whole-corpus extrapolation (when the list API reported a
+    total), bytes per file extension, and the license distribution. Projects
+    whose page cannot be fetched or parsed are reported on stderr and skipped.
+
+    Args:
+        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        ``0`` on success, ``1`` when no project could be sampled.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--pages", type=int, default=3)
+    ap.add_argument("--page-size", type=int, default=30)
+    ap.add_argument("--sort", default="hot")
+    ap.add_argument("--sleep", type=float, default=1.0, help="seconds between detail fetches")
+    args = ap.parse_args(argv)
+
+    sample_sizes: list[int] = []  # per-project total bytes
+    sample_counts: list[int] = []  # per-project attachment count
+    ext_hist: dict[str, int] = {}  # bytes by extension
+    lic_hist: dict[str, int] = {}  # sampled projects by license
+
+    total = None  # project count reported by the list API (last page fetched wins)
+    with make_client() as client:
+        for page in range(1, args.pages + 1):
+            res = list_projects(client, page=page, page_size=args.page_size, sort=args.sort)
+            total = res["total"]
+            items = res["lists"]
+            if not items:
+                # Ran past the last page; requesting more would only add load.
+                break
+            for it in items:
+                path = it["path"]
+                url = f"{BASE}/{path}"
+                try:
+                    r = client.get(url)
+                    r.raise_for_status()
+                    d = parse_detail_html(r.text)
+                except Exception as e:
+                    # Best effort: one bad page must not abort the whole sample.
+                    print(f"  skip {path}: {e}", file=sys.stderr)
+                    # Throttle failures too; a struggling or rate-limiting
+                    # server is exactly when the delay matters most.
+                    time.sleep(args.sleep)
+                    continue
+
+                lic = d.get("license") or "unknown"
+                lic_hist[lic] = lic_hist.get(lic, 0) + 1
+
+                proj_size = 0
+                count = 0
+                # `or []` also covers an "attachments" key that is present but None.
+                for a in d.get("attachments") or []:
+                    size = a.get("size") or 0
+                    ext = (a.get("ext") or "?").lower()
+                    proj_size += size
+                    count += 1
+                    ext_hist[ext] = ext_hist.get(ext, 0) + size
+                sample_sizes.append(proj_size)
+                sample_counts.append(count)
+                print(
+                    f"  p{page:02d} {path:50.50} files={count:>2} size={fmt_mb(proj_size)}",
+                    flush=True,
+                )
+                time.sleep(args.sleep)
+
+    if not sample_sizes:
+        print("no samples")
+        return 1
+
+    n = len(sample_sizes)
+    total_bytes = sum(sample_sizes)
+    mean = st.mean(sample_sizes)
+    median = st.median(sample_sizes)
+    max_ = max(sample_sizes)
+    # Nearest-rank percentile: the value at 1-based rank ceil(0.9 * n), computed
+    # in integers to avoid float truncation. For n < 10 that rank is n, so the
+    # result is the sample maximum.
+    p90_rank = (9 * n + 9) // 10
+    p90 = sorted(sample_sizes)[p90_rank - 1]
+
+    print()
+    print(f"sampled: {n} projects (sort={args.sort})")
+    print(f"attachments/proj: mean={st.mean(sample_counts):.1f} "
+          f"median={st.median(sample_counts):.0f} max={max(sample_counts)}")
+    print(f"size/proj:        mean={fmt_mb(mean)}  median={fmt_mb(median)}  "
+          f"p90={fmt_mb(p90)}  max={fmt_mb(max_)}")
+    print(f"sample total:     {fmt_mb(total_bytes)}")
+    if total:
+        # Extrapolate per-project statistics to the corpus size the API reported.
+        est_full = mean * total
+        print(f"\ncorpus total (API reports):  {total} projects")
+        print(f"  × mean → estimate:    {fmt_gb(est_full)}")
+        print(f"  × median → estimate:  {fmt_gb(median * total)}")
+        print(f"  × p90 → upper bound:  {fmt_gb(p90 * total)}")
+
+    print("\ntop ext by total bytes:")
+    for ext, b in sorted(ext_hist.items(), key=lambda x: -x[1])[:10]:
+        print(f"  .{ext:6} {fmt_mb(b):>12}")
+
+    print("\nlicense distribution in sample:")
+    for lic, c in sorted(lic_hist.items(), key=lambda x: -x[1])[:10]:
+        pct = 100 * c / n
+        print(f"  {lic:30} {c:>3}  ({pct:.0f}%)")
+
+    return 0
+
+
+if __name__ == "__main__":
+    # Script entry point: the return value of main() becomes the exit status.
+    sys.exit(main())