Files
FacereDataset/scripts/build_index.py
Zhang Jiahao ce22717288 Add projects.md index (stars-sorted) + build_index.py generator
Why:
- Charles 要一个索引页看入库项目 + 他们的 stars。手工维护会漂移,
  所以 scripts/build_index.py 直接读 metadata.json 重新生成,保证
  projects.md 永远是 data/raw/ 的镜像。

What:
- projects.md: 10 个项目按 Stars 倒序(最高 3293 的加热台量产计划
  → 最低 236 的柚子爱 AI 相机),含 stars/likes/forks/views/comments/
  files/size,+ License 与数据源分布
- scripts/build_index.py: 扫 metadata.json 渲染 markdown,支持未来
  多数据源(source 字段区分),下次新增 oshwhub / github / hackaday
  项目后重跑即可
- README.md: 加 projects.md 链接

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 19:48:21 +08:00

156 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Scan data/raw/*/*/metadata.json and build projects.md (index, sorted by stars desc).
Usage:
uv run python scripts/build_index.py
uv run python scripts/build_index.py --out projects.md
"""
from __future__ import annotations
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
# Repository root: two directory levels above this script's resolved path.
REPO = Path(__file__).resolve().parent.parent
def fmt_mb(b: int) -> str:
    """Format a byte count as mebibytes with one decimal place (no unit suffix)."""
    bytes_per_mib = 1 << 20
    return format(b / bytes_per_mib, ".1f")
def collect() -> list[dict]:
    """Load every ``metadata.json`` under ``REPO/data/raw`` into index rows.

    Each row is a flat dict with the fields the renderer needs: ``uuid``,
    ``title``, ``source``, ``source_url``, ``author_display``,
    ``author_username``, ``license``, ``metrics``, ``files_count``,
    ``files_bytes`` and ``local_dir`` (repo-relative, POSIX-style).

    Returns:
        Rows sorted by stars descending, ties broken by likes descending and
        then by metadata path, so repeated runs produce the same order.

    Raises:
        KeyError: if a metadata file lacks ``project_id``, ``title``,
            ``source``, ``source_url``, ``author`` or ``author.username``.
        json.JSONDecodeError: if a metadata file is not valid JSON.
    """
    raw_root = REPO / "data" / "raw"
    rows: list[dict] = []
    # rglob() yields entries in filesystem order; sorting the paths first makes
    # the (stable) sort below deterministic for rows with equal stars and likes.
    for meta in sorted(raw_root.rglob("metadata.json")):
        m = json.loads(meta.read_text(encoding="utf-8"))
        # Tolerate both a missing "files" key and an explicit null.
        files = m.get("files") or []
        bytes_total = sum(f.get("size") or 0 for f in files)
        author = m["author"]
        rows.append(
            {
                "uuid": m["project_id"],
                "title": m["title"],
                "source": m["source"],
                "source_url": m["source_url"],
                "author_display": author.get("display_name") or author["username"],
                "author_username": author["username"],
                "license": m.get("license") or "unknown",
                "metrics": m.get("metrics") or {},
                "files_count": len(files),
                "files_bytes": bytes_total,
                # as_posix(): this path is embedded in a Markdown link, so it
                # must use forward slashes even when generated on Windows.
                "local_dir": meta.parent.relative_to(REPO).as_posix(),
            }
        )
    # Sort by stars descending, tie-break by likes descending; a missing or
    # null metric counts as 0.
    rows.sort(
        key=lambda r: (
            -(r["metrics"].get("stars") or 0),
            -(r["metrics"].get("likes") or 0),
        )
    )
    return rows
def _metric(metrics: dict, key: str) -> int:
    """Return ``metrics[key]``, treating a missing or null value as 0."""
    return metrics.get(key) or 0


def _cell(text: object) -> str:
    """Escape pipe characters so *text* cannot split a Markdown table cell."""
    return str(text).replace("|", "\\|")


def _distribution(rows: list[dict], field: str) -> list[str]:
    """Return bullet lines counting rows per distinct ``row[field]``, most common first."""
    counts: dict[str, int] = {}
    for r in rows:
        counts[r[field]] = counts.get(r[field], 0) + 1
    # Stable sort: values with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda kv: -kv[1])
    return [f"- `{name}` — {n} 项目" for name, n in ranked]


def render(rows: list[dict]) -> str:
    """Render the project index as a Markdown document.

    Args:
        rows: Row dicts as produced by ``collect()``; they are rendered in the
            order given.

    Returns:
        The full Markdown text: a header with a UTC generation timestamp and
        totals, one table row per project, summary totals, license and source
        distributions, and static sections describing the directory layout and
        how to regenerate the file. The text ends with a newline.
    """
    out: list[str] = []
    w = out.append
    total_files = sum(r["files_count"] for r in rows)
    total_bytes = sum(r["files_bytes"] for r in rows)
    total_stars = sum(_metric(r["metrics"], "stars") for r in rows)
    total_likes = sum(_metric(r["metrics"], "likes") for r in rows)
    total_views = sum(_metric(r["metrics"], "views") for r in rows)

    w("# Crawled Projects Index")
    w("")
    w(f"_自动生成最近更新 {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}_")
    w("")
    w(
        f"**当前**{len(rows)} 个项目 · {total_files} 个附件 · {fmt_mb(total_bytes)} MB"
    )
    w("")
    w("> 按 **Stars 倒序**。Title → 源站UUID → 本仓库对应目录。")
    w("")
    w(
        "| # | Title | Author | License | "
        "⭐ Stars | ❤️ Likes | 🍴 Forks | 👁 Views | 💬 Comments | Files | Size (MB) |"
    )
    w(
        "|---|-------|--------|---------|"
        "--------:|---------:|---------:|---------:|------------:|------:|----------:|"
    )
    for i, r in enumerate(rows, 1):
        m = r["metrics"]
        # Titles, author names and licenses come from scraped metadata; escape
        # pipes so they cannot break the table row.
        title_link = f"[{_cell(r['title'])}]({r['source_url']})"
        # Author link inference: oshwhub profiles live at
        # `https://oshwhub.com/<username>`; other sources fall back to the
        # project URL.
        if r["source"] == "oshwhub":
            author_url = f"https://oshwhub.com/{r['author_username']}"
        else:
            author_url = r["source_url"]  # fallback
        author_link = f"[{_cell(r['author_display'])}]({author_url})"
        uuid_short = r["uuid"][:8]
        dir_link = f"[`{uuid_short}…`](./{r['local_dir']}/)"
        # _metric() rather than m.get(key, 0): a metric that is present but
        # null would otherwise raise TypeError in the ":," format spec.
        w(
            f"| {i} | {title_link}<br>{dir_link} | {author_link} | {_cell(r['license'])} | "
            f"{_metric(m, 'stars'):,} | {_metric(m, 'likes'):,} | {_metric(m, 'forks'):,} | "
            f"{_metric(m, 'views'):,} | {_metric(m, 'comments'):,} | "
            f"{r['files_count']} | {fmt_mb(r['files_bytes'])} |"
        )
    w("")
    w("## 汇总")
    w("")
    # max(..., 1) avoids division by zero when there are no rows.
    avg_stars = total_stars // max(len(rows), 1)
    w(f"- Stars 合计 **{total_stars:,}**(平均 {avg_stars:,}/项目)")
    w(f"- Likes 合计 **{total_likes:,}**")
    w(f"- Views 合计 **{total_views:,}**")
    w("")
    w("### License 分布")
    w("")
    out.extend(_distribution(rows, "license"))
    w("")
    w("### 数据源分布")
    w("")
    out.extend(_distribution(rows, "source"))
    w("")
    w("## 目录结构(每个项目)")
    w("")
    w("```")
    w("data/raw/<source>/<uuid>/")
    w("├── metadata.json # 统一 schema见 schemas/project.schema.json")
    w("├── description.md # 标题 + 简介 + 许可证")
    w("├── cover.{jpg,png} # 封面")
    w("├── _urls.json # 所有原始 URL")
    w("└── files/* # 原始附件Git LFS")
    w("```")
    w("")
    w("## 重新生成")
    w("")
    w("```bash")
    w("uv run python scripts/build_index.py")
    w("```")
    w("")
    return "\n".join(out)
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: rebuild the index file and report what was written.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always 0, used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=REPO / "projects.md")
    out_path = parser.parse_args(argv).out

    projects = collect()
    markdown = render(projects)
    out_path.write_text(markdown, encoding="utf-8")
    print(f"wrote {out_path} ({len(projects)} projects)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())