Add corpus size/license estimator; snapshot 90-project findings

Why:
- 放量决策需要比"52MB/项目 × 12493 = 650GB"更扎实的数据。用
  scripts/estimate_size.py 采样 90 个 hot 项目的 attachments[].size
  得到真实分布(median 9MB / p90 54MB),全量 median 估算 110GB,
  p90 上界 660GB。这给 Charles 一个可信的存储预算。
- 附带 license 和 ext 分布采出两个重要洞察:
  (1) mp4+qt 视频占 54% 存储,加 --skip-ext 开关可节省一半;
  (2) NC (Non-Commercial) 许可 ~11%,下游必须按 whitelist 过滤。

What:
- scripts/estimate_size.py: 无下载的元数据采样器,复用 crawler.parse_detail_html
- docs/sources/oshwhub_corpus_estimate.md: 结果快照 + 决策建议
- log.md: 本次会话记录

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Zhang Jiahao
2026-04-23 19:45:54 +08:00
parent c8d55a22eb
commit e222b08f27
3 changed files with 213 additions and 1 deletions

123
scripts/estimate_size.py Normal file
View File

@@ -0,0 +1,123 @@
"""Estimate full-corpus storage by sampling oshwhub detail pages (no downloads).
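
Fetch `--pages` pages from the list API (`--page-size` projects per page), parse
the `attachments[]` of every project's detail page, and sum the `size` fields.
No attachment is downloaded; only HTML pages are fetched, so the load on the
server stays low. Handy for quickly giving Charles a storage estimate before
scaling up the crawl.

Usage:
    uv run python scripts/estimate_size.py --pages 5 --sort hot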
"""
from __future__ import annotations
import argparse
import statistics as st
import sys
import time
from pathlib import Path
# Reuse crawler helpers; avoid duplicating HTTP/parse code
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import ( # noqa: E402
make_client,
list_projects,
parse_detail_html,
BASE,
)
def fmt_mb(bytes_: float) -> str:
    """Format a byte count as binary megabytes with one decimal place."""
    megabytes = bytes_ / 1024 / 1024
    return f"{megabytes:.1f} MB"
def fmt_gb(bytes_: float) -> str:
    """Format a byte count as binary gigabytes with two decimal places."""
    gigabytes = bytes_ / 1024 / 1024 / 1024
    return f"{gigabytes:.2f} GB"
def _nearest_rank_p90(values: list[int]) -> int:
    """Return the 90th percentile of *values* using the nearest-rank method.

    The result is always one of the observed values. The rank is computed
    with integer arithmetic so float rounding cannot shift the index, and
    for fewer than ten values it naturally resolves to the maximum.
    *values* must be non-empty.
    """
    ordered = sorted(values)
    rank = -(-9 * len(ordered) // 10)  # ceil(0.9 * n), 1-based
    return ordered[rank - 1]


def _tally_project(
    detail: dict,
    ext_hist: dict[str, int],
    lic_hist: dict[str, int],
) -> tuple[int, int]:
    """Fold one parsed detail page into the running histograms.

    Args:
        detail: Result of ``parse_detail_html``; read via ``.get`` for the
            ``license`` and ``attachments`` keys.
        ext_hist: Bytes per lower-cased extension, updated in place.
        lic_hist: Project count per license, updated in place.

    Returns:
        ``(total attachment bytes, attachment count)`` for this project.
    """
    lic = detail.get("license") or "unknown"
    lic_hist[lic] = lic_hist.get(lic, 0) + 1

    proj_size = 0
    count = 0
    # ``or []`` also covers an "attachments" key that is present but null.
    for att in detail.get("attachments") or []:
        size = att.get("size") or 0
        ext = (att.get("ext") or "?").lower()
        proj_size += size
        count += 1
        ext_hist[ext] = ext_hist.get(ext, 0) + size
    return proj_size, count


def main(argv: list[str] | None = None) -> int:
    """Sample project detail pages and print size, extension and license stats.

    Walks ``--pages`` pages of the project list, fetches each project's
    detail page, sums the ``size`` of its attachments, and prints per-project
    statistics plus whole-corpus extrapolations based on the total the list
    API reports.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 after printing the report, 1 when no project could be sampled.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pages", type=int, default=3)
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
    ap.add_argument("--sleep", type=float, default=1.0, help="seconds between detail fetches")
    args = ap.parse_args(argv)

    if args.pages < 1:
        # The page range would be empty; do not open a client for nothing.
        print("no samples")
        return 1

    sample_sizes: list[int] = []   # per-project total bytes
    sample_counts: list[int] = []  # per-project attachment count
    ext_hist: dict[str, int] = {}  # bytes by extension
    lic_hist: dict[str, int] = {}  # projects by license
    total = None                   # corpus size as reported by the list API

    with make_client() as client:
        for page in range(1, args.pages + 1):
            res = list_projects(client, page=page, page_size=args.page_size, sort=args.sort)
            total = res["total"]
            items = res["lists"]
            if not items:
                # An empty page presumably means the listing is exhausted;
                # stop instead of issuing further unthrottled list requests.
                break
            for it in items:
                path = it["path"]
                url = f"{BASE}/{path}"
                try:
                    r = client.get(url)
                    r.raise_for_status()
                    d = parse_detail_html(r.text)
                except Exception as e:
                    # Best-effort sampling: report the failure and move on.
                    print(f" skip {path}: {e}", file=sys.stderr)
                else:
                    proj_size, count = _tally_project(d, ext_hist, lic_hist)
                    sample_sizes.append(proj_size)
                    sample_counts.append(count)
                    print(
                        f" p{page:02d} {path:50.50} files={count:>2} size={fmt_mb(proj_size)}",
                        flush=True,
                    )
                # Throttle after every attempt, failed ones included, so a
                # run of errors does not turn into back-to-back requests.
                time.sleep(args.sleep)

    if not sample_sizes:
        print("no samples")
        return 1

    n = len(sample_sizes)
    total_bytes = sum(sample_sizes)
    mean = st.mean(sample_sizes)
    median = st.median(sample_sizes)
    p90 = _nearest_rank_p90(sample_sizes)
    max_ = max(sample_sizes)

    print()
    print(f"sampled: {n} projects (sort={args.sort})")
    print(f"attachments/proj: mean={st.mean(sample_counts):.1f} "
          f"median={st.median(sample_counts):.0f} max={max(sample_counts)}")
    print(f"size/proj: mean={fmt_mb(mean)} median={fmt_mb(median)} "
          f"p90={fmt_mb(p90)} max={fmt_mb(max_)}")
    print(f"sample total: {fmt_mb(total_bytes)}")

    if total:
        # Extrapolate per-project figures to the corpus size the API reports.
        est_full = mean * total
        print(f"\ncorpus total (API reports): {total} projects")
        print(f" × mean → estimate: {fmt_gb(est_full)}")
        print(f" × median → estimate: {fmt_gb(median * total)}")
        print(f" × p90 → upper bound: {fmt_gb(p90 * total)}")

    print("\ntop ext by total bytes:")
    for ext, b in sorted(ext_hist.items(), key=lambda x: -x[1])[:10]:
        print(f" .{ext:6} {fmt_mb(b):>12}")

    print("\nlicense distribution in sample:")
    for lic, c in sorted(lic_hist.items(), key=lambda x: -x[1])[:10]:
        pct = 100 * c / n
        print(f" {lic:30} {c:>3} ({pct:.0f}%)")
    return 0
# Script entry point: hand main()'s return code to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())