# FacereDataset/crawlers/oshwhub/crawler.py

"""oshwhub.com crawler — MVP.

Usage:
    uv run python -m crawlers.oshwhub \
        --out data/raw/oshwhub \
        --top 10 --min-likes 50 --min-grade 4

Or with explicit UUID list:
    uv run python -m crawlers.oshwhub \
        --uuids 298873b7fdbe44f8ba0e7351e023bc2c,7b6a398811f14eba9a952b8d2ddd7ace \
        --out data/raw/oshwhub
"""

from __future__ import annotations

import argparse
import dataclasses as dc
import hashlib
import html as _html
import json
import re
import sys
import time
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator

import httpx

# JSON listing endpoint queried by list_projects().
API_LIST = "https://oshwhub.com/api/project"
# Site root; a project's detail page URL is BASE + "/" + the listing item's "path".
BASE = "https://oshwhub.com"
# Host prepended to attachment "src" values that start with "/".
IMG_CDN = "https://image.lceda.cn"
# User-Agent header sent by the client built in make_client().
UA = "FacereDataset/0.1 (+https://git.deepknow.site/Facere/FacereDataset)"
SLEEP_BETWEEN = 2.0  # seconds polite_sleep() pauses after a listing / detail-page / file fetch


# ---------------------------------------------------------------------------
# HTTP
# ---------------------------------------------------------------------------

def make_client(timeout: float = 30.0) -> httpx.Client:
    """Build the shared HTTP/2 client used for every request.

    Args:
        timeout: Timeout in seconds handed to ``httpx.Client``.

    Returns:
        A client that follows redirects and sends the crawler's
        User-Agent and Accept headers.
    """
    default_headers = {
        "User-Agent": UA,
        "Accept": "text/html,application/json;q=0.9,*/*;q=0.8",
    }
    client = httpx.Client(
        http2=True,
        timeout=timeout,
        headers=default_headers,
        follow_redirects=True,
    )
    return client


def polite_sleep() -> None:
    """Pause for ``SLEEP_BETWEEN`` seconds so requests are spaced out."""
    pause_seconds = SLEEP_BETWEEN
    time.sleep(pause_seconds)


# ---------------------------------------------------------------------------
# Listing
# ---------------------------------------------------------------------------

def list_projects(
    client: httpx.Client,
    page: int = 1,
    page_size: int = 30,
    sort: str = "hot",
) -> dict:
    """Fetch one page of the project listing API.

    Args:
        client: HTTP client used for the request.
        page: Page number sent as ``page``.
        page_size: Number of entries sent as ``pageSize``.
        sort: Sort key sent as ``sort``.

    Returns:
        The ``result`` member of the JSON response.

    Raises:
        httpx.HTTPStatusError: on a 4xx/5xx response.
        RuntimeError: if the response's ``success`` field is falsy.
    """
    query = {"page": page, "pageSize": page_size, "sort": sort}
    response = client.get(API_LIST, params=query)
    response.raise_for_status()

    payload = response.json()
    if payload.get("success"):
        return payload["result"]
    raise RuntimeError(f"list API failed: {payload}")


def rank_score(item: dict) -> float:
    """Composite quality score: favor projects with broad engagement.

    Weights: likes x3, stars x1, forks x2, views / 100, comments x2 and
    grade x50.

    Args:
        item: A listing entry. ``item["count"]`` must provide ``like``,
            ``star``, ``fork`` and ``views``. ``comments_count`` and
            ``grade`` are optional; missing or ``None`` counts as 0.

    Returns:
        The weighted score (a float because of the views term).

    Raises:
        KeyError: if ``count`` or one of its four required counters is absent.
    """
    counts = item["count"]
    # crawl_one() treats comments_count as optional, so the sort key must not
    # blow up on an entry that lacks it.
    comments = item.get("comments_count") or 0
    grade = item.get("grade") or 0
    return (
        counts["like"] * 3
        + counts["star"] * 1
        + counts["fork"] * 2
        + counts["views"] / 100
        + comments * 2
        + grade * 50
    )


def pick_top(
    items: list[dict],
    n: int,
    min_likes: int,
    min_grade: int,
    exclude_copies: bool = True,
) -> list[dict]:
    """Select the ``n`` best-scoring listing entries that pass the filters.

    Args:
        items: Listing entries to choose from.
        n: Slice bound applied to the ranked list.
        min_likes: Entries with fewer likes are dropped.
        min_grade: Entries with a lower grade (missing counts as 0) are dropped.
        exclude_copies: When true, drop entries whose path contains "_copy".

    Returns:
        The surviving entries ordered by ``rank_score`` descending, cut to ``n``.
    """

    def _passes(entry: dict) -> bool:
        # Checks run in the same order as the guards they replace.
        if exclude_copies and "_copy" in entry["path"]:
            return False
        if entry["count"]["like"] < min_likes:
            return False
        return not ((entry.get("grade") or 0) < min_grade)

    survivors = [entry for entry in items if _passes(entry)]
    ranked = sorted(survivors, key=rank_score, reverse=True)
    return ranked[:n]


# ---------------------------------------------------------------------------
# Detail page parsing
# ---------------------------------------------------------------------------

# Matches the literal text \"attachments\":[ (quotes preceded by a backslash);
# the match ends on the opening "[" of the array. re.DOTALL has no effect
# here because the pattern contains no ".".
RE_ATTACH_BLOCK = re.compile(r'\\"attachments\\":\[', re.DOTALL)
# Matches \"license\":\"<value>\"; group 1 is the value, which may contain
# neither a backslash nor a double quote.
RE_LICENSE = re.compile(r'\\"license\\":\\"([^\\"]+)\\"')
# Captures the content attribute of <meta name="description" content="...">.
# Only matches when name comes directly before content.
RE_META_DESC = re.compile(
    r'<meta\s+name="description"\s+content="([^"]*)"', re.IGNORECASE
)
# Captures the text of an attribute-less <title> element.
RE_TITLE = re.compile(r"<title>([^<]+)</title>", re.IGNORECASE)


def _find_balanced_bracket(s: str, start: int, open_ch: str = "[", close_ch: str = "]") -> int:
    """Return index after the matching close bracket. start must point at open_ch."""
    assert s[start] == open_ch
    depth = 0
    for i in range(start, len(s)):
        ch = s[i]
        if ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return i + 1
    raise ValueError("unbalanced")


def parse_detail_html(h: str) -> dict:
    """Extract attachments, license, title, description from SSR HTML.

    Args:
        h: Full HTML text of a project detail page.

    Returns:
        A dict with keys ``title``, ``description_meta`` and ``license``
        (each ``None`` when not found) and ``attachments`` (a list, empty
        when absent or unparseable). If the attachments block is found but
        cannot be extracted or decoded, ``_attachments_parse_error`` holds
        the error text and the other fields are still returned.
    """
    out: dict = {
        "title": None,
        "description_meta": None,
        "license": None,
        "attachments": [],
    }

    m = RE_TITLE.search(h)
    if m:
        # HTML entities + suffix stripping
        title = _html.unescape(m.group(1)).strip()
        for sfx in (
            " - 立创开源硬件平台 - 深圳创电优选科技有限公司",
            " - 立创开源硬件平台",
        ):
            if title.endswith(sfx):
                title = title[: -len(sfx)]
        out["title"] = title

    m = RE_META_DESC.search(h)
    if m:
        out["description_meta"] = _html.unescape(m.group(1))

    m = RE_LICENSE.search(h)
    if m:
        out["license"] = m.group(1)

    m = RE_ATTACH_BLOCK.search(h)
    if m:
        arr_start = m.end() - 1  # point at '['
        try:
            arr_end = _find_balanced_bracket(h, arr_start)
            block = h[arr_start:arr_end]
            # The array is embedded with backslash-escaped quotes; undo one
            # level of escaping to recover plain JSON.
            clean = block.replace('\\"', '"').replace("\\\\", "\\")
            out["attachments"] = json.loads(clean)
        except ValueError as e:
            # Covers both an unbalanced array from the bracket scan and
            # json.JSONDecodeError (a ValueError subclass). Attachments are
            # best-effort: record the error and keep the fields already
            # parsed. Caller can log.
            out["_attachments_parse_error"] = str(e)

    return out


# ---------------------------------------------------------------------------
# Download helpers
# ---------------------------------------------------------------------------

def download_to(client: httpx.Client, url: str, dest: Path) -> tuple[int, str]:
    """Stream-download url to dest. Returns (size, sha256).

    The body is written to a sibling ``<name>.part`` file and renamed onto
    ``dest`` only once the transfer has finished, so a failed download never
    leaves a truncated file at ``dest`` nor overwrites an existing one.

    Args:
        client: HTTP client used to open the streaming GET request.
        url: URL to download.
        dest: Final file path; parent directories are created as needed.

    Returns:
        ``(size, sha256)``: bytes written and their SHA-256 hex digest.

    Raises:
        Whatever the request, status check or file I/O raises; the partial
        ``.part`` file is removed before the exception propagates.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    digest = hashlib.sha256()
    size = 0
    try:
        with client.stream("GET", url) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_bytes(1 << 15):
                    f.write(chunk)
                    digest.update(chunk)
                    size += len(chunk)
        # Publish the finished file in one step.
        partial.replace(dest)
    except BaseException:
        # Also runs on KeyboardInterrupt so an aborted run leaves no debris.
        partial.unlink(missing_ok=True)
        raise
    return size, digest.hexdigest()


# ---------------------------------------------------------------------------
# Single-project crawl
# ---------------------------------------------------------------------------

@dc.dataclass
class CrawlResult:
    """Summary of a single project crawl, as returned by crawl_one()."""
    project_id: str  # the listing item's "uuid"
    out_dir: Path  # per-project output directory (out_root / uuid)
    files_count: int  # attachment entries recorded, whether or not they were downloaded
    bytes_total: int  # bytes of the attachments that were downloaded successfully
    skipped_files: list[str]  # "name: error" strings for attachments whose download failed


def crawl_one(
    client: httpx.Client,
    list_item: dict,
    out_root: Path,
    fetch_files: bool = True,
) -> CrawlResult:
    """Crawl one project into ``out_root / <uuid>``.

    Fetches and parses the detail page, then writes into the project
    directory: the cover image, ``description.md``, the attachments under
    ``files/`` (when ``fetch_files`` is true), ``_urls.json`` and
    ``metadata.json``.

    Args:
        client: HTTP client used for every request.
        list_item: One entry from the listing API. ``uuid``, ``path``,
            ``name``, ``owner`` and ``count`` are required; ``thumb`` and the
            remaining fields are optional.
        out_root: Directory under which the per-project directory is created.
        fetch_files: When false, attachments are recorded in the metadata but
            not downloaded.

    Returns:
        A ``CrawlResult`` summarising what was recorded and downloaded.

    Raises:
        httpx.HTTPError: if the detail page cannot be fetched. The same error
            during a cover or attachment download is reported on stderr and
            recorded instead of raised.
        KeyError: if ``list_item`` lacks one of the required fields.
    """
    uuid = list_item["uuid"]
    path = list_item["path"]
    proj_dir = out_root / uuid
    proj_dir.mkdir(parents=True, exist_ok=True)

    # 1. Fetch detail HTML
    detail_url = f"{BASE}/{path}"
    r = client.get(detail_url)
    r.raise_for_status()
    detail = parse_detail_html(r.text)
    polite_sleep()

    # 2. Cover image
    # A missing or null thumbnail is normalised to "" so the startswith()
    # call below cannot fail and the falsy checks further down still apply.
    thumb_url = list_item.get("thumb") or ""
    if thumb_url.startswith("//"):
        thumb_url = "https:" + thumb_url
    cover_rel = None
    if thumb_url:
        ext = Path(urllib.parse.urlparse(thumb_url).path).suffix or ".jpg"
        cover_rel = f"cover{ext}"
        try:
            download_to(client, thumb_url, proj_dir / cover_rel)
        except httpx.HTTPError as e:
            print(f"  cover failed: {e}", file=sys.stderr)
            cover_rel = None
        polite_sleep()

    # 3. Description markdown (combine meta + introduction)
    desc_md_parts = [f"# {list_item['name']}\n"]
    if detail.get("description_meta"):
        desc_md_parts.append(detail["description_meta"].strip())
    elif list_item.get("introduction"):
        desc_md_parts.append(list_item["introduction"].strip())
    desc_md_parts.append(
        f"\n---\n"
        f"- Source: {detail_url}\n"
        f"- Author: {list_item['owner'].get('nickname')} "
        f"({list_item['owner'].get('username')})\n"
        f"- License: {detail.get('license') or 'unknown'}\n"
        f"- Published: {list_item.get('oshwhub_publish_at')}\n"
    )
    (proj_dir / "description.md").write_text("\n".join(desc_md_parts), encoding="utf-8")

    # 4. Files
    files_meta: list[dict] = []
    skipped: list[str] = []
    bytes_total = 0
    # Case-folded local names already handed out, so two attachments can
    # never map to the same file, even on a case-insensitive filesystem.
    used_names: set[str] = set()
    for index, a in enumerate(detail.get("attachments", [])):
        src = a.get("src") or ""
        if not src:
            continue
        file_url = IMG_CDN + src if src.startswith("/") else src
        name = a.get("name") or Path(src).name
        safe_name = re.sub(r'[/\\:*?"<>|]', "_", name)
        if not safe_name.strip(". "):
            # "", "." or ".." would resolve to a directory, not a file.
            safe_name = f"attachment_{index}"
        stem, suffix = Path(safe_name).stem, Path(safe_name).suffix
        unique_name = safe_name
        serial = 1
        while unique_name.casefold() in used_names:
            unique_name = f"{stem}_{serial}{suffix}"
            serial += 1
        used_names.add(unique_name.casefold())
        local_rel = f"files/{unique_name}"
        local_path = proj_dir / local_rel

        entry: dict = {
            "name": name,
            "url": file_url,
            "original_id": a.get("uuid"),
            "ext": a.get("ext"),
            "mime": a.get("mime"),
            "size": a.get("size"),
            "md5": a.get("md5"),
        }
        if fetch_files:
            try:
                size, sha = download_to(client, file_url, local_path)
                entry["path"] = local_rel
                entry["sha256"] = sha
                # Keep the advertised size; record the real one only when
                # the two disagree.
                if entry.get("size") and entry["size"] != size:
                    entry["size_actual"] = size
                else:
                    entry["size"] = size
                bytes_total += size
            except httpx.HTTPError as e:
                skipped.append(f"{name}: {e}")
                print(f"  file skipped {name}: {e}", file=sys.stderr)
            polite_sleep()
        files_meta.append(entry)

    # 5. URL manifest (for files we couldn't download or for future re-download)
    urls_manifest = {
        "detail_url": detail_url,
        "cover_url": thumb_url,
        "attachments": [
            {"name": f["name"], "url": f["url"], "original_id": f.get("original_id")}
            for f in files_meta
        ],
    }
    (proj_dir / "_urls.json").write_text(
        json.dumps(urls_manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # 6. Unified metadata
    meta = {
        "source": "oshwhub",
        "source_url": detail_url,
        "project_id": uuid,
        "title": detail.get("title") or list_item["name"],
        "description_short": list_item.get("introduction") or "",
        "description_path": "description.md",
        "author": {
            "username": list_item["owner"]["username"],
            "display_name": list_item["owner"].get("nickname"),
            "user_id": list_item["owner"].get("uuid"),
        },
        "license": detail.get("license") or "unknown",
        "tags": list_item.get("tags") or [],
        "created_at": list_item.get("created_at"),
        "updated_at": list_item.get("updated_at"),
        "published_at": list_item.get("oshwhub_publish_at"),
        "crawled_at": datetime.now(timezone.utc).isoformat(),
        "metrics": {
            "likes": list_item["count"]["like"],
            "stars": list_item["count"]["star"],
            "forks": list_item["count"]["fork"],
            "views": list_item["count"]["views"],
            "watch": list_item["count"].get("watch", 0),
            "comments": list_item.get("comments_count", 0),
        },
        "cover": {"url": thumb_url, "path": cover_rel} if thumb_url else None,
        "files": files_meta,
        "raw_fields": {
            "path": list_item["path"],
            "grade": list_item.get("grade"),
            "origin": list_item.get("origin"),
            "public": list_item.get("public"),
            "publish": list_item.get("publish"),
            "skipped_files": skipped,
        },
    }
    (proj_dir / "metadata.json").write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    return CrawlResult(
        project_id=uuid,
        out_dir=proj_dir,
        files_count=len(files_meta),
        bytes_total=bytes_total,
        skipped_files=skipped,
    )


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def iter_candidates(
    client: httpx.Client,
    pages: int,
    page_size: int,
    sort: str,
) -> Iterator[dict]:
    """Lazily yield listing entries from pages ``1..pages``.

    Each page is fetched only when the consumer reaches it, and a polite
    pause follows every page once its entries have been consumed.

    Args:
        client: HTTP client used for the listing requests.
        pages: Number of listing pages to walk.
        page_size: Entries requested per page.
        sort: Sort key forwarded to the listing API.

    Yields:
        The entries under each page's ``lists`` key, in page order.
    """
    page_no = 1
    while page_no <= pages:
        listing = list_projects(client, page=page_no, page_size=page_size, sort=sort)
        yield from listing["lists"]
        polite_sleep()
        page_no += 1


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: select projects and crawl each one.

    With ``--uuids`` the listing pages are scanned for exactly those
    projects; otherwise the scanned pool is filtered and ranked by
    ``pick_top``. A failure in one project is reported on stderr and does
    not stop the run.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        0, including when individual projects failed.
    """
    ap = argparse.ArgumentParser(description="oshwhub MVP crawler")
    ap.add_argument("--out", type=Path, default=Path("data/raw/oshwhub"))
    ap.add_argument("--top", type=int, default=10, help="number of projects to crawl")
    ap.add_argument("--min-likes", type=int, default=50)
    ap.add_argument("--min-grade", type=int, default=4)
    ap.add_argument("--pages", type=int, default=3, help="list API pages to scan")
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
    ap.add_argument("--uuids", type=str, default=None, help="comma-separated explicit UUID list")
    ap.add_argument("--no-files", action="store_true", help="do not download attachments")
    ap.add_argument("--limit", type=int, default=None, help="override --top, same effect")
    args = ap.parse_args(argv)

    n_target = args.limit if args.limit is not None else args.top
    args.out.mkdir(parents=True, exist_ok=True)

    with make_client() as client:
        # Build list of items to crawl
        if args.uuids:
            # Tolerate "a, b," style input: strip blanks and drop empty
            # tokens, which could never match and would force a full scan.
            wanted = {u.strip() for u in args.uuids.split(",") if u.strip()}
            found: dict[str, dict] = {}
            if wanted:
                for it in iter_candidates(client, args.pages, args.page_size, args.sort):
                    uid = it["uuid"]
                    # Key by uuid so a repeated listing entry is neither
                    # crawled twice nor counted toward completion.
                    if uid in wanted and uid not in found:
                        found[uid] = it
                    if len(found) == len(wanted):
                        break
            items: list[dict] = list(found.values())
            missing = wanted - found.keys()
            if missing:
                print(f"WARN: missing uuids (not in top pages): {missing}", file=sys.stderr)
        else:
            # Keep the first occurrence of each uuid. NOTE(review): assumes
            # the listing can repeat a project across pages; confirm.
            pool_by_uuid: dict[str, dict] = {}
            for it in iter_candidates(client, args.pages, args.page_size, args.sort):
                pool_by_uuid.setdefault(it["uuid"], it)
            items = pick_top(
                list(pool_by_uuid.values()),
                n=n_target,
                min_likes=args.min_likes,
                min_grade=args.min_grade,
            )
            if len(items) < n_target:
                print(
                    f"WARN: only {len(items)} items passed filters "
                    f"(wanted {n_target})",
                    file=sys.stderr,
                )

        print(f"Crawling {len(items)} projects -> {args.out}")
        for i, it in enumerate(items, 1):
            print(f"[{i}/{len(items)}] {it['path']}  ({it['name']})")
            try:
                r = crawl_one(client, it, args.out, fetch_files=not args.no_files)
                print(
                    f"  OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
                    f"(skipped: {len(r.skipped_files)})"
                )
            except Exception as e:
                # Top-level boundary: report and move on to the next project.
                print(f"  FAIL: {e}", file=sys.stderr)

    return 0


# Script entry point: run the CLI and use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())