Add corpus size/license estimator; snapshot 90-project findings
Why:
- 放量决策需要比"52MB/项目 × 12493 = 650GB"更扎实的数据。用 scripts/estimate_size.py 采样 90 个 hot 项目的 attachments[].size 得到真实分布(median 9MB / p90 54MB),全量 median 估算 110GB, p90 上界 660GB。这给 Charles 一个可信的存储预算。
- 附带 license 和 ext 分布采出两个重要洞察: (1) mp4+qt 视频占 54% 存储,加 --skip-ext 开关可节省一半; (2) NC (Non-Commercial) 许可 ~11%,下游必须按 whitelist 过滤。

What:
- scripts/estimate_size.py: 无下载的元数据采样器,复用 crawler.parse_detail_html
- docs/sources/oshwhub_corpus_estimate.md: 结果快照 + 决策建议
- log.md: 本次会话记录

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
123
scripts/estimate_size.py
Normal file
123
scripts/estimate_size.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Estimate full-corpus storage by sampling oshwhub detail pages (no downloads).
|
||||
|
||||
从列表 API 取 N 个项目,解析每个详情页的 `attachments[]`,把 `size` 字段求和。
|
||||
不下载任何附件,仅抓 HTML 页,对服务器压力小;可用来快速给 Charles 一个
|
||||
放量存储估计。
|
||||
|
||||
Usage:
|
||||
uv run python scripts/estimate_size.py --pages 5 --sort hot
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
import statistics as st
import sys
import time
from collections import Counter
from pathlib import Path
|
||||
|
||||
# Reuse crawler helpers; avoid duplicating HTTP/parse code
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from crawlers.oshwhub.crawler import ( # noqa: E402
|
||||
make_client,
|
||||
list_projects,
|
||||
parse_detail_html,
|
||||
BASE,
|
||||
)
|
||||
|
||||
|
||||
def fmt_mb(bytes_: float) -> str:
    """Format a byte count as mebibytes with one decimal place.

    Args:
        bytes_: Size in bytes (int or float).

    Returns:
        A string such as ``"1.5 MB"`` (1 MB = 1024 * 1024 bytes here).
    """
    return f"{bytes_ / 1024 / 1024:.1f} MB"
|
||||
|
||||
|
||||
def fmt_gb(bytes_: float) -> str:
    """Format a byte count as gibibytes with two decimal places.

    Args:
        bytes_: Size in bytes (int or float).

    Returns:
        A string such as ``"0.50 GB"`` (1 GB = 1024 ** 3 bytes here).
    """
    return f"{bytes_ / 1024 / 1024 / 1024:.2f} GB"
|
||||
|
||||
|
||||
def _nearest_rank(sorted_values: list[int], percent: int) -> int:
    """Return the nearest-rank percentile of an ascending, non-empty list.

    Nearest-rank picks the element at 1-based rank ``ceil(percent/100 * n)``,
    i.e. the smallest value such that at least ``percent`` % of the samples
    are less than or equal to it.
    """
    n = len(sorted_values)
    # Integer ceiling division: avoids float rounding pushing the rank up.
    rank = max(1, -(-n * percent // 100))
    return sorted_values[rank - 1]


def main(argv: list[str] | None = None) -> int:
    """Sample project detail pages and print a storage/license estimate.

    Walks ``--pages`` pages of the project listing, fetches each project's
    detail page and sums the ``size`` of every entry in its ``attachments``
    list; no attachment is downloaded.  Per-project statistics, a
    whole-corpus extrapolation (when the listing reports a ``total``), the
    top extensions by bytes and the license distribution go to stdout;
    projects that could not be fetched or parsed are reported on stderr and
    skipped.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when no project could be sampled.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pages", type=int, default=3)
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
    ap.add_argument("--sleep", type=float, default=1.0, help="seconds between detail fetches")
    args = ap.parse_args(argv)

    sample_sizes: list[int] = []  # per-project total bytes
    sample_counts: list[int] = []  # per-project attachment count
    ext_hist: Counter[str] = Counter()  # bytes by extension
    lic_hist: Counter[str] = Counter()  # projects by license

    total = None  # corpus size as reported by the listing (last page wins)
    with make_client() as client:
        for page in range(1, args.pages + 1):
            res = list_projects(client, page=page, page_size=args.page_size, sort=args.sort)
            total = res["total"]
            for it in res["lists"]:
                path = it["path"]
                url = f"{BASE}/{path}"
                try:
                    r = client.get(url)
                    r.raise_for_status()
                    d = parse_detail_html(r.text)
                except Exception as e:
                    # Best effort: one bad page must not abort the sampling run.
                    print(f" skip {path}: {e}", file=sys.stderr)
                    # Keep the delay after failures too, so a run of errors
                    # (e.g. rate limiting) does not become back-to-back requests.
                    time.sleep(args.sleep)
                    continue

                lic = d.get("license") or "unknown"
                lic_hist[lic] += 1

                proj_size = 0
                count = 0
                for a in d.get("attachments", []):
                    size = a.get("size") or 0  # missing/None size counts as 0 bytes
                    ext = (a.get("ext") or "?").lower()
                    proj_size += size
                    count += 1
                    ext_hist[ext] += size
                sample_sizes.append(proj_size)
                sample_counts.append(count)
                print(
                    f" p{page:02d} {path:50.50} files={count:>2} size={fmt_mb(proj_size)}",
                    flush=True,
                )
                time.sleep(args.sleep)

    if not sample_sizes:
        print("no samples")
        return 1

    n = len(sample_sizes)
    total_bytes = sum(sample_sizes)
    mean = st.mean(sample_sizes)
    median = st.median(sample_sizes)
    # Nearest-rank p90; for n < 10 this is the maximum, and for n == 10 it is
    # the 9th value rather than the largest one.
    p90 = _nearest_rank(sorted(sample_sizes), 90)
    max_ = max(sample_sizes)

    print()
    print(f"sampled: {n} projects (sort={args.sort})")
    print(f"attachments/proj: mean={st.mean(sample_counts):.1f} "
          f"median={st.median(sample_counts):.0f} max={max(sample_counts)}")
    print(f"size/proj: mean={fmt_mb(mean)} median={fmt_mb(median)} "
          f"p90={fmt_mb(p90)} max={fmt_mb(max_)}")
    print(f"sample total: {fmt_mb(total_bytes)}")
    if total:
        est_full = mean * total
        print(f"\ncorpus total (API reports): {total} projects")
        print(f" × mean → estimate: {fmt_gb(est_full)}")
        print(f" × median → estimate: {fmt_gb(median * total)}")
        print(f" × p90 → upper bound: {fmt_gb(p90 * total)}")

    print("\ntop ext by total bytes:")
    for ext, b in ext_hist.most_common(10):
        print(f" .{ext:6} {fmt_mb(b):>12}")

    print("\nlicense distribution in sample:")
    for lic, c in lic_hist.most_common(10):
        pct = 100 * c / n
        print(f" {lic:30} {c:>3} ({pct:.0f}%)")

    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user