Files
FacereDataset/scripts/estimate_size.py
Zhang Jiahao e222b08f27 Add corpus size/license estimator; snapshot 90-project findings
Why:
- 放量决策需要比"52MB/项目 × 12493 = 650GB"更扎实的数据。用
  scripts/estimate_size.py 采样 90 个 hot 项目的 attachments[].size
  得到真实分布(median 9MB / p90 54MB),全量 median 估算 110GB,
  p90 上界 660GB。这给 Charles 一个可信的存储预算。
- 附带 license 和 ext 分布采出两个重要洞察:
  (1) mp4+qt 视频占 54% 存储,加 --skip-ext 开关可节省一半;
  (2) NC (Non-Commercial) 许可 ~11%,下游必须按 whitelist 过滤。

What:
- scripts/estimate_size.py: 无下载的元数据采样器,复用 crawler.parse_detail_html
- docs/sources/oshwhub_corpus_estimate.md: 结果快照 + 决策建议
- log.md: 本次会话记录

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 19:45:54 +08:00

124 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Estimate full-corpus storage by sampling oshwhub detail pages (no downloads).
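Fetch N projects from the listing API, parse the `attachments[]` of each
detail page, and sum the `size` fields. No attachments are downloaded; only
HTML pages are fetched, so the load on the server is small. Useful for quickly
giving Charles a storage estimate before scaling up the crawl.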
Usage:
uv run python scripts/estimate_size.py --pages 5 --sort hot
"""
from __future__ import annotations
import argparse
import statistics as st
import sys
import time
from pathlib import Path
# Reuse crawler helpers; avoid duplicating HTTP/parse code
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import ( # noqa: E402
make_client,
list_projects,
parse_detail_html,
BASE,
)
def fmt_mb(bytes_: float) -> str:
    """Render a byte count in units of 1024*1024 bytes, one decimal, ``MB`` suffix."""
    mebibytes = bytes_ / (1 << 20)
    return "{:.1f} MB".format(mebibytes)
def fmt_gb(bytes_: float) -> str:
    """Render a byte count in units of 1024**3 bytes, two decimals, ``GB`` suffix."""
    gibibytes = bytes_ / (1 << 30)
    return "{:.2f} GB".format(gibibytes)
def main(argv: list[str] | None = None) -> int:
    """Sample project detail pages and print a corpus storage estimate.

    Walks ``--pages`` pages of the project listing, fetches the detail page
    of every listed project, and sums the ``size`` of its attachments. A
    progress line is printed per sampled project, followed by summary
    statistics, a full-corpus extrapolation (when the listing reports a
    non-zero total), the top extensions by bytes, and the license
    distribution of the sample. Projects whose detail page cannot be fetched
    or parsed are reported on stderr and skipped.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 after printing the report, 1 when no project could be sampled.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pages", type=int, default=3)
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
    ap.add_argument("--sleep", type=float, default=1.0, help="seconds between detail fetches")
    args = ap.parse_args(argv)

    sample_sizes: list[int] = []  # per-project total bytes
    sample_counts: list[int] = []  # per-project attachment count
    ext_hist: dict[str, int] = {}  # bytes by extension
    lic_hist: dict[str, int] = {}  # sampled-project count by license
    total = None  # project count reported by the most recent listing response

    with make_client() as client:
        for page in range(1, args.pages + 1):
            res = list_projects(client, page=page, page_size=args.page_size, sort=args.sort)
            total = res["total"]
            for it in res["lists"]:
                path = it["path"]
                url = f"{BASE}/{path}"
                try:
                    r = client.get(url)
                    r.raise_for_status()
                    d = parse_detail_html(r.text)
                except Exception as e:
                    # Deliberately broad: one bad page must not abort the
                    # whole sampling run.
                    print(f" skip {path}: {e}", file=sys.stderr)
                    # Keep the politeness delay on failures too; otherwise a
                    # run of errors hits the server with no pause at all.
                    time.sleep(args.sleep)
                    continue

                lic = d.get("license") or "unknown"
                lic_hist[lic] = lic_hist.get(lic, 0) + 1

                proj_size = 0
                count = 0
                # ``or []`` also covers an explicit ``None`` value, which the
                # two-argument form of ``dict.get`` would pass through.
                for a in d.get("attachments") or []:
                    size = a.get("size") or 0
                    ext = (a.get("ext") or "?").lower()
                    proj_size += size
                    count += 1
                    ext_hist[ext] = ext_hist.get(ext, 0) + size

                sample_sizes.append(proj_size)
                sample_counts.append(count)
                print(
                    f" p{page:02d} {path:50.50} files={count:>2} size={fmt_mb(proj_size)}",
                    flush=True,
                )
                time.sleep(args.sleep)

    if not sample_sizes:
        print("no samples")
        return 1

    n = len(sample_sizes)
    total_bytes = sum(sample_sizes)
    mean = st.mean(sample_sizes)
    median = st.median(sample_sizes)
    # p90 is the element at index int(0.9 * n) of the sorted sample; with
    # fewer than 10 samples the maximum is used instead.
    p90 = sorted(sample_sizes)[int(n * 0.9)] if n >= 10 else max(sample_sizes)
    max_ = max(sample_sizes)

    print()
    print(f"sampled: {n} projects (sort={args.sort})")
    print(f"attachments/proj: mean={st.mean(sample_counts):.1f} "
          f"median={st.median(sample_counts):.0f} max={max(sample_counts)}")
    print(f"size/proj: mean={fmt_mb(mean)} median={fmt_mb(median)} "
          f"p90={fmt_mb(p90)} max={fmt_mb(max_)}")
    print(f"sample total: {fmt_mb(total_bytes)}")

    if total:
        # Extrapolate per-project statistics to the reported project count.
        est_full = mean * total
        print(f"\ncorpus total (API reports): {total} projects")
        print(f" × mean → estimate: {fmt_gb(est_full)}")
        print(f" × median → estimate: {fmt_gb(median * total)}")
        print(f" × p90 → upper bound: {fmt_gb(p90 * total)}")

    print("\ntop ext by total bytes:")
    for ext, b in sorted(ext_hist.items(), key=lambda x: -x[1])[:10]:
        print(f" .{ext:6} {fmt_mb(b):>12}")

    print("\nlicense distribution in sample:")
    for lic, c in sorted(lic_hist.items(), key=lambda x: -x[1])[:10]:
        pct = 100 * c / n
        print(f" {lic:30} {c:>3} ({pct:.0f}%)")
    return 0
if __name__ == "__main__":
    # Hand main()'s integer result to the interpreter as the exit status.
    sys.exit(main())