# FacereDataset/scripts/build_index.py

"""Scan data/raw/*/*/metadata.json and build projects.md (index, sorted by stars desc).

Usage:
    uv run python scripts/build_index.py
    uv run python scripts/build_index.py --out projects.md
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path

# Repository root: the parent of the directory that contains this script
# (resolved to an absolute path). All data and output paths are built from it.
REPO = Path(__file__).resolve().parent.parent


def fmt_mb(b: int) -> str:
    """Return the byte count *b* expressed in MiB with one decimal place.

    The result carries no unit suffix, e.g. ``1572864`` -> ``"1.5"``.
    """
    mebibytes = b / (1024 * 1024)
    return format(mebibytes, ".1f")


def collect() -> list[dict]:
    """Read every ``metadata.json`` below ``data/raw`` and summarise each project.

    Returns:
        One dict per metadata file with the keys ``uuid``, ``title``,
        ``source``, ``source_url``, ``author_display``, ``author_username``,
        ``license``, ``metrics``, ``files_count``, ``files_bytes`` and
        ``local_dir`` (the project directory relative to the repository root,
        always with forward slashes). Rows are ordered by stars descending,
        then likes descending; remaining ties follow the sorted file paths.

    Raises:
        KeyError: a metadata file lacks one of the required keys
            (``project_id``, ``title``, ``source``, ``source_url``,
            ``author`` / ``author.username``).
        json.JSONDecodeError: a metadata file is not valid JSON.
    """
    rows: list[dict] = []
    # rglob yields paths in filesystem order; sorting them first keeps the
    # (stable) metric sort below deterministic for projects with equal metrics.
    for meta in sorted((REPO / "data" / "raw").rglob("metadata.json")):
        m = json.loads(meta.read_text(encoding="utf-8"))
        # "files" may be absent or explicitly null; treat both as "no files".
        files = m.get("files") or []
        author = m["author"]
        bytes_total = sum(f.get("size") or 0 for f in files)
        rows.append(
            {
                "uuid": m["project_id"],
                "title": m["title"],
                "source": m["source"],
                "source_url": m["source_url"],
                "author_display": author.get("display_name") or author["username"],
                "author_username": author["username"],
                "license": m.get("license") or "unknown",
                "metrics": m.get("metrics") or {},
                "files_count": len(files),
                "files_bytes": bytes_total,
                # as_posix(): this value ends up in Markdown links, which need
                # forward slashes even when the script runs on Windows.
                "local_dir": meta.parent.relative_to(REPO).as_posix(),
            }
        )
    # Sort by stars descending, tie-break by likes descending; a missing or
    # null metric counts as 0.
    rows.sort(
        key=lambda r: (
            -(r["metrics"].get("stars") or 0),
            -(r["metrics"].get("likes") or 0),
        )
    )
    return rows


def _md_cell(text: object) -> str:
    """Make *text* safe inside a Markdown table cell: escape pipes, flatten newlines."""
    return str(text).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def _count_by(rows: list[dict], key: str) -> list[tuple[str, int]]:
    """Count rows per distinct ``row[key]``, most frequent first (ties keep first-seen order)."""
    counts: dict[str, int] = {}
    for r in rows:
        counts[r[key]] = counts.get(r[key], 0) + 1
    return sorted(counts.items(), key=lambda kv: -kv[1])


def render(rows: list[dict]) -> str:
    """Render the project rows as the Markdown text of the index file.

    Args:
        rows: Project summaries in display order. Each row must provide the
            keys ``uuid``, ``title``, ``source``, ``source_url``,
            ``author_display``, ``author_username``, ``license``, ``metrics``,
            ``files_count``, ``files_bytes`` and ``local_dir``.

    Returns:
        The complete Markdown document: header with a UTC generation
        timestamp, one table row per project, aggregate totals, license and
        source distributions, and static layout / regeneration notes. The
        text ends with a trailing newline.

    A metric that is missing or ``None`` is displayed and summed as 0.
    """
    out: list[str] = []
    w = out.append

    def metric_total(name: str) -> int:
        return sum((r["metrics"].get(name) or 0) for r in rows)

    total_files = sum(r["files_count"] for r in rows)
    total_bytes = sum(r["files_bytes"] for r in rows)
    total_stars = metric_total("stars")
    total_likes = metric_total("likes")
    total_views = metric_total("views")

    w("# Crawled Projects Index")
    w("")
    w(f"_自动生成，最近更新 {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}_")
    w("")
    w(
        f"**当前**：{len(rows)} 个项目 · {total_files} 个附件 · {fmt_mb(total_bytes)} MB"
    )
    w("")
    w("> 按 **Stars 倒序**。Title → 源站；UUID → 本仓库对应目录。")
    w("")
    w(
        "| # | Title | Author | License | "
        "⭐ Stars | ❤️ Likes | 🍴 Forks | 👁 Views | 💬 Comments | Files | Size (MB) |"
    )
    w(
        "|---|-------|--------|---------|"
        "--------:|---------:|---------:|---------:|------------:|------:|----------:|"
    )
    for i, r in enumerate(rows, 1):
        m = r["metrics"]
        # `or 0` (not a .get default): a metric stored as null would otherwise
        # reach the `:,` format spec as None and raise TypeError.
        stars, likes, forks, views, comments = (
            m.get(k) or 0 for k in ("stars", "likes", "forks", "views", "comments")
        )
        # Title and author are free text from the source site; an unescaped
        # pipe or newline would break the table row.
        title_link = f"[{_md_cell(r['title'])}]({r['source_url']})"
        # Author link inference: oshwhub profile URLs have the form
        # `https://oshwhub.com/<username>`.
        if r["source"] == "oshwhub":
            author_url = f"https://oshwhub.com/{r['author_username']}"
        else:
            author_url = r["source_url"]  # fallback: link to the project page
        author_link = f"[{_md_cell(r['author_display'])}]({author_url})"
        uuid_short = r["uuid"][:8]
        dir_link = f"[`{uuid_short}…`](./{r['local_dir']}/)"
        w(
            f"| {i} | {title_link}<br>{dir_link} | {author_link} | {r['license']} | "
            f"{stars:,} | {likes:,} | {forks:,} | "
            f"{views:,} | {comments:,} | "
            f"{r['files_count']} | {fmt_mb(r['files_bytes'])} |"
        )
    w("")
    w("## 汇总")
    w("")
    # max(..., 1) avoids division by zero when there are no projects.
    avg_stars = total_stars // max(len(rows), 1)
    w(f"- Stars 合计 **{total_stars:,}**（平均 {avg_stars:,}/项目）")
    w(f"- Likes 合计 **{total_likes:,}**")
    w(f"- Views 合计 **{total_views:,}**")
    w("")
    w("### License 分布")
    w("")
    for lic, c in _count_by(rows, "license"):
        w(f"- `{lic}` — {c} 项目")
    w("")
    w("### 数据源分布")
    w("")
    for src, c in _count_by(rows, "source"):
        w(f"- `{src}` — {c} 项目")
    w("")
    w("## 目录结构（每个项目）")
    w("")
    w("```")
    w("data/raw/<source>/<uuid>/")
    w("├── metadata.json   # 统一 schema，见 schemas/project.schema.json")
    w("├── description.md  # 标题 + 简介 + 许可证")
    w("├── cover.{jpg,png} # 封面")
    w("├── _urls.json      # 所有原始 URL")
    w("└── files/*         # 原始附件（Git LFS）")
    w("```")
    w("")
    w("## 重新生成")
    w("")
    w("```bash")
    w("uv run python scripts/build_index.py")
    w("```")
    w("")
    return "\n".join(out)


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: collect the projects and write the index file.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
            The only option is ``--out``, the destination path, which
            defaults to ``projects.md`` in the repository root.

    Returns:
        ``0`` once the file has been written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=REPO / "projects.md")
    out_path: Path = parser.parse_args(argv).out

    projects = collect()
    out_path.write_text(render(projects), encoding="utf-8")
    print(f"wrote {out_path} ({len(projects)} projects)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())