Files
FacereDataset/crawlers/oshwhub/crawler.py
Knowit d874278bc5 Add EasyEDA Std project source ingestion (10 boards backfilled)
打通 oshwhub origin=std 项目的工程源(schematic + PCB dataStr)抓取链路。原
plan.md §1.6 假设需要登录,实测 lceda.cn/api/documents/<doc>?uuid=<doc>&path=<doc>
对公开项目匿名可访问 —— 无需 cookie,无账号封禁风险。

调研:4 轮探测留痕在 data/state/std_probe[1-5]/(gitignored);翻 Std 编辑器
v6.5.51 的 main.min.js bundle 找到 ajaxDetail 端点;按 docType 区分两种
响应 shape(schematic 项目视图 vs PCB 文档视图)。

Crawler:
  - make_source_client() 用浏览器 UA + lceda.cn/editor Referer,因为
    oshwhub /api/project/<uuid> 端点拒绝 FacereDataset/0.1 UA(CLAUDE.md
    UA 例外条款:目标站主动封自定义 UA + 公开静态资源)
  - fetch_std_source(): 项目元 → version_documents → 逐文档 dataStr → 落
    source/<doc>.json + source/manifest.json
  - --with-source(爬新项目时一并抓源)/ --backfill-source(仅扫已有)
  - QPS ≤ 0.2 (SLEEP_SOURCE = 5s) 自律

Schema: 加 source_format / source_path / source_documents / editor_version
(前 3 进 enum 锁定,便于后续 Pro / KiCad 源对齐)。

回填结果:10/10 成功,45 个文档,33.2 MB;schema validate 全通。
docTypes 主要是 1 (schematic) 与 3 (pcb);USB 电压电流表只有 PCB 文档(4 个:
主板+盖板+底板+面板,作者未上传原理图源)。

完整调研:docs/sources/easyeda_std_source.md。
2026-04-28 20:07:40 +08:00

669 lines
23 KiB
Python

"""oshwhub.com crawler — MVP.
Usage:
uv run python -m crawlers.oshwhub \
--out data/raw/oshwhub \
--top 10 --min-likes 50 --min-grade 4
Or with explicit UUID list:
uv run python -m crawlers.oshwhub \
--uuids 298873b7fdbe44f8ba0e7351e023bc2c,7b6a398811f14eba9a952b8d2ddd7ace \
--out data/raw/oshwhub
"""
from __future__ import annotations
import argparse
import dataclasses as dc
import hashlib
import html as _html
import json
import re
import sys
import time
import urllib.parse
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator
import httpx
# Project listing endpoint; queried with page / pageSize / sort parameters.
API_LIST = "https://oshwhub.com/api/project"
API_PROJECT = "https://oshwhub.com/api/project" # /api/project/<uuid> for source flow
# Site root; a project's detail page URL is BASE/<path>.
BASE = "https://oshwhub.com"
# Host prepended to attachment `src` values that start with "/".
IMG_CDN = "https://image.lceda.cn"
# Per-document source endpoint, requested as LCEDA_DOC_API/<doc_uuid> with uuid/path query params.
LCEDA_DOC_API = "https://lceda.cn/api/documents"
# Self-identifying User-Agent sent by the default client.
UA = "FacereDataset/0.1 (+https://git.deepknow.site/Facere/FacereDataset)"
# Std source endpoints reject FacereDataset UA on oshwhub /api/project; spoof browser UA only there.
# See docs/sources/easyeda_std_source.md §3.
BROWSER_UA = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/147.0.0.0 Safari/537.36"
)
SLEEP_BETWEEN = 2.0 # seconds between detail-page / file fetches
SLEEP_SOURCE = 5.0 # source fetch is sensitive — QPS ≤ 0.2, in the spirit of the CLAUDE.md logged-in-session rules
# ---------------------------------------------------------------------------
# HTTP
# ---------------------------------------------------------------------------
def make_client(timeout: float = 30.0) -> httpx.Client:
    """Build the default HTTP/2 client used for listing, detail pages and files.

    The client identifies itself with the project User-Agent and follows
    redirects.

    Args:
        timeout: Request timeout in seconds, passed straight to httpx.
    """
    default_headers = {
        "User-Agent": UA,
        "Accept": "text/html,application/json;q=0.9,*/*;q=0.8",
    }
    return httpx.Client(
        headers=default_headers,
        timeout=timeout,
        http2=True,
        follow_redirects=True,
    )
def make_source_client(timeout: float = 60.0) -> httpx.Client:
    """Build the client for Std source endpoints (lceda.cn/oshwhub.com /api/...).

    Sends a browser User-Agent plus the editor Referer to satisfy the
    server-side UA filter. Redirects are not followed.

    Args:
        timeout: Request timeout in seconds, passed straight to httpx.
    """
    browser_like_headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Referer": "https://lceda.cn/editor",
    }
    return httpx.Client(
        headers=browser_like_headers,
        timeout=timeout,
        http2=True,
        follow_redirects=False,
    )
def polite_sleep() -> None:
    """Block for SLEEP_BETWEEN seconds to space out consecutive requests."""
    pause_seconds = SLEEP_BETWEEN
    time.sleep(pause_seconds)
# ---------------------------------------------------------------------------
# Listing
# ---------------------------------------------------------------------------
def list_projects(
    client: httpx.Client,
    page: int = 1,
    page_size: int = 30,
    sort: str = "hot",
) -> dict:
    """Fetch one page of the project listing and return its ``result`` payload.

    Raises:
        httpx.HTTPStatusError: via ``raise_for_status`` on a non-success status.
        RuntimeError: when the JSON body does not carry a truthy ``success``.
    """
    query = {"page": page, "pageSize": page_size, "sort": sort}
    response = client.get(API_LIST, params=query)
    response.raise_for_status()
    payload = response.json()
    if payload.get("success"):
        return payload["result"]
    raise RuntimeError(f"list API failed: {payload}")
def rank_score(item: dict) -> float:
    """Composite quality score: favor projects with broad engagement.

    Weights: like x3, star x1, fork x2, views /100, comments x2, grade x50.

    ``comments_count`` and ``grade`` are treated as optional (missing or
    ``None`` counts as 0), matching how crawl_one reads ``comments_count``.
    The ``count`` sub-dict and its like/star/fork/views keys stay required.

    Args:
        item: One project entry from the listing API.

    Returns:
        The weighted score; a float because of the views term.

    Raises:
        KeyError: if ``count`` or one of its required keys is missing.
    """
    counts = item["count"]
    comments = item.get("comments_count") or 0
    grade = item.get("grade") or 0
    return (
        counts["like"] * 3
        + counts["star"] * 1
        + counts["fork"] * 2
        + counts["views"] / 100
        + comments * 2
        + grade * 50
    )
def pick_top(
    items: list[dict],
    n: int,
    min_likes: int,
    min_grade: int,
    exclude_copies: bool = True,
) -> list[dict]:
    """Filter listing items by quality thresholds and return the best ``n``.

    An item is dropped when it is a copy (``"_copy"`` in its path, only if
    ``exclude_copies``), has fewer than ``min_likes`` likes, or has a grade
    below ``min_grade`` (a missing grade counts as 0). Survivors are ordered
    by rank_score, highest first.
    """

    def _qualifies(candidate: dict) -> bool:
        if exclude_copies and "_copy" in candidate["path"]:
            return False
        if candidate["count"]["like"] < min_likes:
            return False
        return not ((candidate.get("grade") or 0) < min_grade)

    ranked = sorted(
        (candidate for candidate in items if _qualifies(candidate)),
        key=rank_score,
        reverse=True,
    )
    return ranked[:n]
# ---------------------------------------------------------------------------
# Detail page parsing
# ---------------------------------------------------------------------------
# Finds the opening of the backslash-escaped `\"attachments\":[` array in the page source.
# (re.DOTALL has no effect here: the pattern contains no `.`.)
RE_ATTACH_BLOCK = re.compile(r'\\"attachments\\":\[', re.DOTALL)
# Captures the value of the backslash-escaped `\"license\":\"...\"` field.
RE_LICENSE = re.compile(r'\\"license\\":\\"([^\\"]+)\\"')
# Captures the content attribute of <meta name="description" content="...">.
RE_META_DESC = re.compile(
r'<meta\s+name="description"\s+content="([^"]*)"', re.IGNORECASE
)
# Captures the text of the <title> element.
RE_TITLE = re.compile(r"<title>([^<]+)</title>", re.IGNORECASE)
def _find_balanced_bracket(s: str, start: int, open_ch: str = "[", close_ch: str = "]") -> int:
    """Return the index just past the bracket that closes the one at ``start``.

    The scan is purely character based: it counts ``open_ch`` / ``close_ch``
    and does not know about string literals, so a bracket character inside
    a quoted value is counted like any other.

    Args:
        s: Text to scan.
        start: Index of the opening bracket; ``s[start]`` must equal ``open_ch``.
        open_ch: Opening bracket character.
        close_ch: Closing bracket character.

    Raises:
        ValueError: if ``s[start]`` is not ``open_ch`` or no matching close
            bracket exists.
        IndexError: if ``start`` is outside ``s``.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if s[start] != open_ch:
        raise ValueError(f"expected {open_ch!r} at index {start}, found {s[start]!r}")
    depth = 0
    for i in range(start, len(s)):
        ch = s[i]
        if ch == open_ch:
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0:
                return i + 1
    raise ValueError("unbalanced")
def parse_detail_html(h: str) -> dict:
    """Extract attachments, license, title, description from SSR HTML.

    Args:
        h: Raw HTML of a project detail page.

    Returns:
        A dict with keys ``title``, ``description_meta``, ``license`` (each
        ``None`` when not found) and ``attachments`` (a list, empty when
        absent or unparseable). When the attachments array cannot be
        extracted or decoded, ``_attachments_parse_error`` holds the error
        text so the caller can log it; the other fields are still returned.
    """
    out: dict = {
        "title": None,
        "description_meta": None,
        "license": None,
        "attachments": [],
    }

    m = RE_TITLE.search(h)
    if m:
        # HTML entities + suffix stripping
        title = _html.unescape(m.group(1)).strip()
        for sfx in (
            " - 立创开源硬件平台 - 深圳创电优选科技有限公司",
            " - 立创开源硬件平台",
        ):
            if title.endswith(sfx):
                title = title[: -len(sfx)]
        out["title"] = title

    m = RE_META_DESC.search(h)
    if m:
        out["description_meta"] = _html.unescape(m.group(1))

    m = RE_LICENSE.search(h)
    if m:
        out["license"] = m.group(1)

    m = RE_ATTACH_BLOCK.search(h)
    if m:
        arr_start = m.end() - 1  # point at '['
        try:
            arr_end = _find_balanced_bracket(h, arr_start)
            block = h[arr_start:arr_end]
            # The array is embedded as an escaped string: undo one level of
            # quote / backslash escaping before decoding it as JSON.
            clean = block.replace('\\"', '"').replace("\\\\", "\\")
            out["attachments"] = json.loads(clean)
        except ValueError as e:
            # Covers both an unbalanced bracket scan and json.JSONDecodeError
            # (a ValueError subclass). Record the error and skip attachments,
            # so one malformed blob does not abort the whole project crawl.
            out["_attachments_parse_error"] = str(e)
    return out
# ---------------------------------------------------------------------------
# Download helpers
# ---------------------------------------------------------------------------
def download_to(client: httpx.Client, url: str, dest: Path) -> tuple[int, str]:
    """Stream-download url to dest. Returns (size, sha256).

    The body is written to a ``<name>.part`` sibling and renamed onto
    ``dest`` only after the stream completes. A failure mid-transfer
    therefore never leaves a truncated file at ``dest`` and never clobbers
    a good copy from an earlier run.

    Args:
        client: Object exposing an httpx-style ``stream("GET", url)`` context
            manager.
        url: Absolute URL to fetch.
        dest: Final file path; parent directories are created as needed.

    Returns:
        ``(size_in_bytes, sha256_hexdigest)`` of the downloaded body.

    Raises:
        Whatever the client or the filesystem raises. The partial file is
        removed before the exception propagates.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    digest = hashlib.sha256()
    size = 0
    try:
        with client.stream("GET", url) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_bytes(1 << 15):
                    f.write(chunk)
                    digest.update(chunk)
                    size += len(chunk)
        # Path.replace overwrites an existing dest atomically on POSIX.
        partial.replace(dest)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise
    return size, digest.hexdigest()
# ---------------------------------------------------------------------------
# Std source fetch (login NOT required for public projects — see
# docs/sources/easyeda_std_source.md)
# ---------------------------------------------------------------------------
def fetch_std_source(
    source_client: httpx.Client,
    project_uuid: str,
    proj_dir: Path,
    sleep: float = SLEEP_SOURCE,
) -> dict:
    """Fetch EasyEDA Std project source (schematic + PCB dataStr) anonymously.

    Flow: project meta -> ``version_documents`` -> one request per document
    -> ``source/<doc>.json`` plus ``source/manifest.json`` under ``proj_dir``.
    The function sleeps ``sleep`` seconds after every request.

    Args:
        source_client: Client from make_source_client().
        project_uuid: oshwhub project UUID.
        proj_dir: Per-project output directory; ``source/`` is created in it.
        sleep: Seconds to wait after each request.

    Returns:
        dict with keys:
        - source_format: "easyeda-std"
        - source_path: "source/"
        - source_documents: list of {doc_uuid, docType, master, path, size, sha256}
        - editor_version: from dataStr.head when available

    Raises:
        httpx.HTTPStatusError: on a non-success HTTP status.
        RuntimeError: when a response is not JSON, is not a JSON object, or
            does not report ``success``.
    """
    src_dir = proj_dir / "source"
    src_dir.mkdir(parents=True, exist_ok=True)

    # 1. Project meta → version_documents
    r = source_client.get(f"{API_PROJECT}/{project_uuid}")
    r.raise_for_status()
    j = r.json()
    if not j.get("success"):
        raise RuntimeError(f"oshwhub project meta failed: {j}")
    version_documents = j["result"].get("version_documents") or []
    time.sleep(sleep)

    # 2. Per document → dataStr
    doc_metas: list[dict] = []
    editor_version: str | None = None
    for vd in version_documents:
        doc_uuid = vd["uuid"]
        master = vd.get("master")
        doc_type = vd.get("docType")
        url = f"{LCEDA_DOC_API}/{doc_uuid}"
        r2 = source_client.get(url, params={"uuid": doc_uuid, "path": doc_uuid})
        r2.raise_for_status()
        # Server returns text/html mistakenly; body is JSON regardless.
        try:
            body_json = r2.json()
        except ValueError as e:  # JSONDecodeError / UnicodeDecodeError
            raise RuntimeError(
                f"doc {doc_uuid} non-JSON response: {e}; head={r2.text[:200]!r}"
            ) from e
        if not isinstance(body_json, dict) or not body_json.get("success"):
            raise RuntimeError(f"doc {doc_uuid} response not success: {body_json}")

        # doc_uuid comes from the remote API: strip path-significant characters
        # before using it as a file name (same character set crawl_one
        # replaces in attachment names).
        safe_doc = re.sub(r'[/\\:*?"<>|]', "_", str(doc_uuid))
        local_rel = f"source/{safe_doc}.json"
        local_path = proj_dir / local_rel
        # Encode once so size and sha256 describe exactly the bytes on disk.
        payload = json.dumps(
            body_json, ensure_ascii=False, separators=(",", ":")
        ).encode("utf-8")
        local_path.write_bytes(payload)
        size = len(payload)
        sha = hashlib.sha256(payload).hexdigest()

        # Pull editor version from the dataStr.head if present; first hit wins.
        ev = _extract_editor_version(body_json)
        if ev and not editor_version:
            editor_version = ev
        doc_metas.append({
            "doc_uuid": doc_uuid,
            "docType": doc_type,
            "master": master,
            "path": local_rel,
            "size": size,
            "sha256": sha,
        })
        time.sleep(sleep)

    # 3. source/manifest.json — index + raw upstream version_documents for diffing
    manifest = {
        "project_uuid": project_uuid,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "editor_version": editor_version,
        "documents": doc_metas,
        "upstream_version_documents": version_documents,
    }
    (src_dir / "manifest.json").write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    return {
        "source_format": "easyeda-std",
        "source_path": "source/",
        "source_documents": doc_metas,
        "editor_version": editor_version,
    }
def _extract_editor_version(body_json: dict) -> str | None:
    """Best-effort lookup of ``head.editorVersion`` inside a document response.

    Two response shapes are probed, in this order:
    1. PCB shape: ``result.dataStr.head.editorVersion``
    2. Schematic shape: ``result.schematics[*].dataStr.head.editorVersion``

    Returns the first truthy version found, converted to ``str``, else None.
    """

    def _version_in(data_str: object) -> str | None:
        # Only a dict dataStr with a dict head can carry the version.
        if not isinstance(data_str, dict):
            return None
        header = data_str.get("head")
        if isinstance(header, dict) and header.get("editorVersion"):
            return str(header["editorVersion"])
        return None

    result = body_json.get("result") or {}

    found = _version_in(result.get("dataStr"))
    if found is not None:
        return found

    for sheet in result.get("schematics") or []:
        if not isinstance(sheet, dict):
            continue
        found = _version_in(sheet.get("dataStr"))
        if found is not None:
            return found
    return None
# ---------------------------------------------------------------------------
# Single-project crawl
# ---------------------------------------------------------------------------
@dc.dataclass
class CrawlResult:
    """Summary returned by crawl_one() for a single project."""

    project_id: str  # project UUID from the listing item
    out_dir: Path  # per-project output directory (out_root / uuid)
    files_count: int  # number of attachment entries recorded, downloaded or not
    bytes_total: int  # total bytes of attachments successfully downloaded
    skipped_files: list[str]  # human-readable "<what>: <error>" notes for failed steps
def crawl_one(
    client: httpx.Client,
    list_item: dict,
    out_root: Path,
    fetch_files: bool = True,
    source_client: httpx.Client | None = None,
) -> CrawlResult:
    """Crawl one project into ``out_root/<uuid>/``.

    Steps: detail HTML -> cover image -> ``description.md`` -> attachments
    under ``files/`` -> ``_urls.json`` -> optional EasyEDA Std source ->
    ``metadata.json``.

    Cover, attachment and source failures are best-effort: they are printed
    to stderr and noted in ``skipped_files`` (cover failures are only
    printed). A failure to fetch the detail page propagates.

    Args:
        client: Default crawler client (see make_client()).
        list_item: One project entry from the listing API.
        out_root: Root directory holding one sub-directory per project.
        fetch_files: When False, attachments are listed but not downloaded.
        source_client: When given, project source is fetched with it via
            fetch_std_source().

    Returns:
        A CrawlResult summarising what was written.

    Raises:
        httpx.HTTPStatusError: when the detail page returns a non-success status.
        KeyError: when ``list_item`` lacks a required key (uuid, path, name,
            owner, count, ...).
    """
    uuid = list_item["uuid"]
    path = list_item["path"]
    proj_dir = out_root / uuid
    proj_dir.mkdir(parents=True, exist_ok=True)
    skipped: list[str] = []

    # 1. Fetch detail HTML
    detail_url = f"{BASE}/{path}"
    r = client.get(detail_url)
    r.raise_for_status()
    detail = parse_detail_html(r.text)
    # The parser records attachment decode failures instead of raising;
    # surface them here so an empty attachment list is not mistaken for "none".
    parse_error = detail.get("_attachments_parse_error")
    if parse_error:
        print(f" attachments parse failed: {parse_error}", file=sys.stderr)
        skipped.append(f"attachments: {parse_error}")
    polite_sleep()

    # 2. Cover image
    # Check truthiness before calling startswith: thumb may be absent or None.
    thumb_url = list_item.get("thumb")
    if thumb_url and thumb_url.startswith("//"):
        thumb_url = "https:" + thumb_url
    cover_rel = None
    if thumb_url:
        ext = Path(urllib.parse.urlparse(thumb_url).path).suffix or ".jpg"
        cover_rel = f"cover{ext}"
        try:
            download_to(client, thumb_url, proj_dir / cover_rel)
        except httpx.HTTPError as e:
            print(f" cover failed: {e}", file=sys.stderr)
            cover_rel = None
        polite_sleep()

    # 3. Description markdown (combine meta + introduction)
    desc_md_parts = [f"# {list_item['name']}\n"]
    if detail.get("description_meta"):
        desc_md_parts.append(detail["description_meta"].strip())
    elif list_item.get("introduction"):
        desc_md_parts.append(list_item["introduction"].strip())
    desc_md_parts.append(
        f"\n---\n"
        f"- Source: {detail_url}\n"
        f"- Author: {list_item['owner'].get('nickname')} "
        f"({list_item['owner'].get('username')})\n"
        f"- License: {detail.get('license') or 'unknown'}\n"
        f"- Published: {list_item.get('oshwhub_publish_at')}\n"
    )
    (proj_dir / "description.md").write_text("\n".join(desc_md_parts), encoding="utf-8")

    # 4. Files
    files_meta: list[dict] = []
    bytes_total = 0
    taken_names: set[str] = set()
    for a in detail.get("attachments", []):
        src = a.get("src") or ""
        if not src:
            continue
        file_url = IMG_CDN + src if src.startswith("/") else src
        name = a.get("name") or Path(src).name
        # Unique per project, so two attachments with the same (sanitised)
        # name no longer overwrite each other on disk.
        safe_name = _safe_attachment_name(name, taken_names)
        local_rel = f"files/{safe_name}"
        local_path = proj_dir / local_rel
        entry: dict = {
            "name": name,
            "url": file_url,
            "original_id": a.get("uuid"),
            "ext": a.get("ext"),
            "mime": a.get("mime"),
            "size": a.get("size"),
            "md5": a.get("md5"),
        }
        if fetch_files:
            try:
                size, sha = download_to(client, file_url, local_path)
                entry["path"] = local_rel
                entry["sha256"] = sha
                # Keep the upstream-declared size; record a mismatch separately.
                if entry.get("size") and entry["size"] != size:
                    entry["size_actual"] = size
                else:
                    entry["size"] = size
                bytes_total += size
            except httpx.HTTPError as e:
                skipped.append(f"{name}: {e}")
                print(f" file skipped {name}: {e}", file=sys.stderr)
            polite_sleep()
        files_meta.append(entry)

    # 5. URL manifest (for files we couldn't download or for future re-download)
    urls_manifest = {
        "detail_url": detail_url,
        "cover_url": thumb_url,
        "attachments": [
            {"name": f["name"], "url": f["url"], "original_id": f.get("original_id")}
            for f in files_meta
        ],
    }
    (proj_dir / "_urls.json").write_text(
        json.dumps(urls_manifest, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # 6. Optional: EasyEDA Std project source (schematic + PCB dataStr)
    src_meta: dict = {}
    if source_client is not None:
        try:
            src_meta = fetch_std_source(source_client, uuid, proj_dir)
            print(
                f" source: {len(src_meta.get('source_documents', []))} docs, "
                f"editor={src_meta.get('editor_version')}"
            )
        except Exception as e:  # noqa: BLE001 - source is best-effort
            print(f" source FAIL: {e}", file=sys.stderr)
            skipped.append(f"source: {e}")

    # 7. Unified metadata
    meta = {
        "source": "oshwhub",
        "source_url": detail_url,
        "project_id": uuid,
        "title": detail.get("title") or list_item["name"],
        "description_short": list_item.get("introduction") or "",
        "description_path": "description.md",
        "author": {
            "username": list_item["owner"]["username"],
            "display_name": list_item["owner"].get("nickname"),
            "user_id": list_item["owner"].get("uuid"),
        },
        "license": detail.get("license") or "unknown",
        "tags": list_item.get("tags") or [],
        "created_at": list_item.get("created_at"),
        "updated_at": list_item.get("updated_at"),
        "published_at": list_item.get("oshwhub_publish_at"),
        "crawled_at": datetime.now(timezone.utc).isoformat(),
        "metrics": {
            "likes": list_item["count"]["like"],
            "stars": list_item["count"]["star"],
            "forks": list_item["count"]["fork"],
            "views": list_item["count"]["views"],
            "watch": list_item["count"].get("watch", 0),
            "comments": list_item.get("comments_count", 0),
        },
        "cover": {"url": thumb_url, "path": cover_rel} if thumb_url else None,
        "files": files_meta,
        "raw_fields": {
            "path": list_item["path"],
            "grade": list_item.get("grade"),
            "origin": list_item.get("origin"),
            "public": list_item.get("public"),
            "publish": list_item.get("publish"),
            "skipped_files": skipped,
        },
    }
    if src_meta:
        meta["source_format"] = src_meta["source_format"]
        meta["source_path"] = src_meta["source_path"]
        meta["source_documents"] = src_meta["source_documents"]
        if src_meta.get("editor_version"):
            meta["editor_version"] = src_meta["editor_version"]
    (proj_dir / "metadata.json").write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    return CrawlResult(
        project_id=uuid,
        out_dir=proj_dir,
        files_count=len(files_meta),
        bytes_total=bytes_total,
        skipped_files=skipped,
    )


def _safe_attachment_name(name: str, taken: set[str]) -> str:
    """Return a filesystem-safe file name not yet present in ``taken``.

    Path-significant characters become ``_``. A name made only of dots and
    spaces (including the empty string) becomes ``unnamed``. On a collision,
    ``_1``, ``_2``, ... is inserted before the suffix. Names are compared
    case-insensitively so the result is also unique on case-insensitive
    filesystems. The chosen name is added to ``taken``.
    """
    safe = re.sub(r'[/\\:*?"<>|]', "_", name)
    if not safe.strip(". "):
        safe = "unnamed"
    stem, suffix = Path(safe).stem, Path(safe).suffix
    candidate = safe
    counter = 1
    while candidate.casefold() in taken:
        candidate = f"{stem}_{counter}{suffix}"
        counter += 1
    taken.add(candidate.casefold())
    return candidate
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def iter_candidates(
    client: httpx.Client,
    pages: int,
    page_size: int,
    sort: str,
) -> Iterator[dict]:
    """Yield listing items from pages 1..``pages``, pausing after each page.

    Pages are fetched lazily: the next request is only made once the
    consumer has drained the current page.
    """
    for page_no in range(1, pages + 1):
        listing = list_projects(client, page=page_no, page_size=page_size, sort=sort)
        yield from listing["lists"]
        polite_sleep()
def main(argv: list[str] | None = None) -> int:
    """CLI entry point for the oshwhub crawler.

    Modes:
    - default: scan ``--pages`` listing pages, keep the top ``--top`` /
      ``--limit`` projects passing ``--min-likes`` / ``--min-grade``;
    - ``--uuids``: crawl only the listed UUIDs found in the scanned pages;
    - ``--backfill-source``: fetch source only for projects already in ``--out``.

    Per-project failures are printed to stderr and do not stop the run.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (always 0 once argument parsing succeeds).
    """
    ap = argparse.ArgumentParser(description="oshwhub MVP crawler")
    ap.add_argument("--out", type=Path, default=Path("data/raw/oshwhub"))
    ap.add_argument("--top", type=int, default=10, help="number of projects to crawl")
    ap.add_argument("--min-likes", type=int, default=50)
    ap.add_argument("--min-grade", type=int, default=4)
    ap.add_argument("--pages", type=int, default=3, help="list API pages to scan")
    ap.add_argument("--page-size", type=int, default=30)
    ap.add_argument("--sort", default="hot")
    ap.add_argument("--uuids", type=str, default=None, help="comma-separated explicit UUID list")
    ap.add_argument("--no-files", action="store_true", help="do not download attachments")
    ap.add_argument("--limit", type=int, default=None, help="override --top, same effect")
    ap.add_argument(
        "--with-source",
        action="store_true",
        help="also fetch EasyEDA Std project source (schematic + PCB dataStr) per project",
    )
    ap.add_argument(
        "--backfill-source",
        action="store_true",
        help="skip listing/HTML/attachments; only fetch source for projects already in --out",
    )
    args = ap.parse_args(argv)
    n_target = args.limit if args.limit is not None else args.top
    args.out.mkdir(parents=True, exist_ok=True)

    # --backfill-source: standalone path that scans existing project dirs and
    # only fetches source. No listing/HTML/attachment work.
    if args.backfill_source:
        return _run_backfill_source(args.out, only_uuids=args.uuids)

    # Tolerate "a, b" and trailing commas: strip whitespace, drop empty entries.
    wanted: set[str] = set()
    if args.uuids:
        wanted = {u.strip() for u in args.uuids.split(",") if u.strip()}
        if not wanted:
            ap.error("--uuids did not contain any UUID")

    with make_client() as client:
        # Build list of items to crawl
        if wanted:
            # Keyed by uuid so a project listed twice (e.g. the ranking shifted
            # between page fetches) is crawled once and cannot trigger the
            # early exit before every wanted uuid was seen.
            found: dict[str, dict] = {}
            for it in iter_candidates(client, args.pages, args.page_size, args.sort):
                if it["uuid"] in wanted and it["uuid"] not in found:
                    found[it["uuid"]] = it
                    if len(found) == len(wanted):
                        break
            items: list[dict] = list(found.values())
            if len(items) < len(wanted):
                missing = wanted - found.keys()
                print(f"WARN: missing uuids (not in top pages): {missing}", file=sys.stderr)
        else:
            pool = list(iter_candidates(client, args.pages, args.page_size, args.sort))
            items = pick_top(
                pool, n=n_target, min_likes=args.min_likes, min_grade=args.min_grade
            )
            if len(items) < n_target:
                print(
                    f"WARN: only {len(items)} items passed filters "
                    f"(wanted {n_target})",
                    file=sys.stderr,
                )

        source_client_ctx = make_source_client() if args.with_source else None
        try:
            print(f"Crawling {len(items)} projects -> {args.out}")
            for i, it in enumerate(items, 1):
                print(f"[{i}/{len(items)}] {it['path']} ({it['name']})")
                try:
                    r = crawl_one(
                        client,
                        it,
                        args.out,
                        fetch_files=not args.no_files,
                        source_client=source_client_ctx,
                    )
                    print(
                        f" OK: {r.files_count} files, {r.bytes_total / 1024 / 1024:.1f} MB "
                        f"(skipped: {len(r.skipped_files)})"
                    )
                except Exception as e:  # noqa: BLE001 - keep going with the next project
                    print(f" FAIL: {e}", file=sys.stderr)
        finally:
            if source_client_ctx is not None:
                source_client_ctx.close()
    return 0
def _run_backfill_source(out_root: Path, only_uuids: str | None = None) -> int:
    """Walk existing per-project dirs in out_root and fetch source into each.

    Only directories that already contain ``metadata.json`` are processed.
    On success that file is rewritten in place with source_format /
    source_path / source_documents (and editor_version when known).
    A failure on one project, including an unreadable or corrupt
    ``metadata.json``, is printed to stderr and the run continues.

    Args:
        out_root: Directory holding one sub-directory per project UUID.
        only_uuids: Optional comma-separated UUID filter. Whitespace around
            entries and empty entries are ignored.

    Returns:
        Process exit code (always 0).
    """
    wanted: set[str] | None = None
    if only_uuids:
        wanted = {u.strip() for u in only_uuids.split(",") if u.strip()}

    targets: list[Path] = []
    for d in sorted(out_root.iterdir()):
        if not d.is_dir():
            continue
        if not (d / "metadata.json").exists():
            continue
        # `is not None`: a filter that parsed to nothing selects nothing
        # rather than silently widening to every project.
        if wanted is not None and d.name not in wanted:
            continue
        targets.append(d)

    if wanted is not None:
        missing = wanted - {d.name for d in targets}
        if missing:
            print(f"WARN: uuids not found under {out_root}: {missing}", file=sys.stderr)

    print(f"Backfill source for {len(targets)} projects under {out_root}")
    with make_source_client() as src_client:
        for i, proj_dir in enumerate(targets, 1):
            uuid = proj_dir.name
            meta_path = proj_dir / "metadata.json"
            # Load metadata first so a broken file costs no network requests
            # and does not abort the remaining projects.
            try:
                meta = json.loads(meta_path.read_text(encoding="utf-8"))
                if not isinstance(meta, dict):
                    raise ValueError("metadata.json is not a JSON object")
            except (OSError, ValueError) as e:
                print(f"[{i}/{len(targets)}] {uuid} (?)")
                print(f" FAIL: {e}", file=sys.stderr)
                continue
            print(f"[{i}/{len(targets)}] {uuid} ({meta.get('title', '?')})")
            try:
                src_meta = fetch_std_source(src_client, uuid, proj_dir)
            except Exception as e:  # noqa: BLE001 - keep going with the next project
                print(f" FAIL: {e}", file=sys.stderr)
                continue
            meta["source_format"] = src_meta["source_format"]
            meta["source_path"] = src_meta["source_path"]
            meta["source_documents"] = src_meta["source_documents"]
            if src_meta.get("editor_version"):
                meta["editor_version"] = src_meta["editor_version"]
            meta_path.write_text(
                json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
            )
            total = sum(d["size"] for d in src_meta["source_documents"])
            print(
                f" OK: {len(src_meta['source_documents'])} docs, "
                f"{total / 1024:.1f} KB, editor={src_meta.get('editor_version')}"
            )
    return 0
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())