Files
FacereDataset/scripts/estimate_size.py
Zhang Jiahao ba501c328c Remove personal name from suggestion/decision phrasing
Why:
- "给 Charles 的建议"、"待 Charles 拍板"、"需要 Charles 决策" 这些写法
  把具体人绑到了文档里,换维护者就失准。改成中性的 "建议 / 待决策 /
  待拍板",文档对未来协作者和 agent 都更通用。

What:
- log.md: 四处去掉 "给 Charles / 还是需要 Charles 决策 / 等 Charles 拍板"
- plan.md: 三处去掉 "待 Charles / Charles 定目标 / 需要 Charles 定"
- docs/sources/hf_bshada_open_schematics.md: "待 Charles 决策" → "待决策"
- scripts/estimate_size.py: docstring 去掉 "给 Charles 一个估计"
- CLAUDE.md: 数据删除确认规则从 "先跟 Charles 确认" 改成 "先跟用户确认"

保留的 Charles 提及都是事实性的:
- README/plan 里的 "维护者:Charles"(身份字段)
- log.md 历史条目里 "Charles 要求..." / "Charles 点名..."(历史事件记录)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 20:01:52 +08:00

123 lines
4.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Estimate full-corpus storage by sampling oshwhub detail pages (no downloads).
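Fetch N projects from the listing API, parse the `attachments[]` of each detail page, and sum the `size` fields.
No attachments are downloaded; only HTML pages are fetched, so the load on the server stays low. Useful for a quick storage estimate before scaling up the crawl.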
Usage:
uv run python scripts/estimate_size.py --pages 5 --sort hot
"""
from __future__ import annotations
import argparse
import statistics as st
import sys
import time
from pathlib import Path
# Reuse crawler helpers; avoid duplicating HTTP/parse code
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from crawlers.oshwhub.crawler import ( # noqa: E402
make_client,
list_projects,
parse_detail_html,
BASE,
)
def fmt_mb(bytes_: float) -> str:
    """Render a byte count as mebibytes with one decimal place, e.g. ``"1.5 MB"``."""
    mebibytes = bytes_ / (1 << 20)
    return format(mebibytes, ".1f") + " MB"
def fmt_gb(bytes_: float) -> str:
    """Render a byte count as gibibytes with two decimal places, e.g. ``"0.50 GB"``."""
    gibibytes = bytes_ / (1 << 30)
    return format(gibibytes, ".2f") + " GB"
def main(argv: list[str] | None = None) -> int:
    """Sample project detail pages and print a corpus storage estimate.

    Lists ``--pages`` pages of projects, fetches each project's detail page,
    and sums the ``size`` of every entry in its ``attachments`` list.  Only
    detail pages are requested; attachment files are never downloaded.
    Per-project progress and the final report go to stdout; skipped projects
    are reported on stderr.

    Args:
        argv: Command-line arguments handed to ``ArgumentParser.parse_args``;
            ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after printing the report, ``1`` if no project could be sampled.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--pages", type=int, default=3)
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
    ap.add_argument("--sleep", type=float, default=1.0, help="seconds between detail fetches")
    args = ap.parse_args(argv)

    sample_sizes: list[int] = []   # per-project total bytes
    sample_counts: list[int] = []  # per-project attachment count
    ext_hist: dict[str, int] = {}  # bytes by extension
    lic_hist: dict[str, int] = {}  # sampled projects by license
    total = None                   # corpus size from the most recent listing response

    with make_client() as client:
        for page in range(1, args.pages + 1):
            res = list_projects(client, page=page, page_size=args.page_size, sort=args.sort)
            total = res["total"]
            if not res["lists"]:
                # An empty page means the listing is exhausted; requesting
                # further pages would only produce more empty responses.
                break
            for it in res["lists"]:
                path = it["path"]
                url = f"{BASE}/{path}"
                try:
                    r = client.get(url)
                    r.raise_for_status()
                    d = parse_detail_html(r.text)
                except Exception as e:
                    # Best-effort sampling: one bad page must not abort the run.
                    print(f" skip {path}: {e}", file=sys.stderr)
                    # Pause after a failed fetch too, so a run of errors
                    # (e.g. throttling) does not become back-to-back requests.
                    time.sleep(args.sleep)
                    continue

                lic = d.get("license") or "unknown"
                lic_hist[lic] = lic_hist.get(lic, 0) + 1

                proj_size = 0
                count = 0
                for a in d.get("attachments", []):
                    # Missing or null size/ext fall back to 0 / "?".
                    size = a.get("size") or 0
                    ext = (a.get("ext") or "?").lower()
                    proj_size += size
                    count += 1
                    ext_hist[ext] = ext_hist.get(ext, 0) + size

                sample_sizes.append(proj_size)
                sample_counts.append(count)
                print(
                    f" p{page:02d} {path:50.50} files={count:>2} size={fmt_mb(proj_size)}",
                    flush=True,
                )
                time.sleep(args.sleep)

    if not sample_sizes:
        print("no samples")
        return 1

    n = len(sample_sizes)
    total_bytes = sum(sample_sizes)
    mean = st.mean(sample_sizes)
    median = st.median(sample_sizes)
    # Element at index int(0.9 * n) of the sorted sample; with fewer than
    # 10 samples that is not meaningful, so fall back to the maximum.
    p90 = sorted(sample_sizes)[int(n * 0.9)] if n >= 10 else max(sample_sizes)
    max_ = max(sample_sizes)

    print()
    print(f"sampled: {n} projects (sort={args.sort})")
    print(f"attachments/proj: mean={st.mean(sample_counts):.1f} "
          f"median={st.median(sample_counts):.0f} max={max(sample_counts)}")
    print(f"size/proj: mean={fmt_mb(mean)} median={fmt_mb(median)} "
          f"p90={fmt_mb(p90)} max={fmt_mb(max_)}")
    print(f"sample total: {fmt_mb(total_bytes)}")

    if total:
        # Extrapolate per-project statistics to the project count the API reported.
        est_full = mean * total
        print(f"\ncorpus total (API reports): {total} projects")
        print(f" × mean → estimate: {fmt_gb(est_full)}")
        print(f" × median → estimate: {fmt_gb(median * total)}")
        print(f" × p90 → upper bound: {fmt_gb(p90 * total)}")

    print("\ntop ext by total bytes:")
    for ext, b in sorted(ext_hist.items(), key=lambda x: -x[1])[:10]:
        print(f" .{ext:6} {fmt_mb(b):>12}")

    print("\nlicense distribution in sample:")
    for lic, c in sorted(lic_hist.items(), key=lambda x: -x[1])[:10]:
        pct = 100 * c / n
        print(f" {lic:30} {c:>3} ({pct:.0f}%)")

    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())