From c8d55a22ebd76af1ef1a314f38c77b605052af61 Mon Sep 17 00:00:00 2001 From: Zhang Jiahao Date: Thu, 23 Apr 2026 19:40:55 +0800 Subject: [PATCH] Add schema+file validator; pin down fs-web-stream as ad icons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Why: - schema 必须能自动校验,否则后续放量无法防腐。现在 scripts/validate.py 对全部 metadata.json 做两层检查(schema + 本地文件 sha256),跑一次 即可对全量数据签收;10/10 项目已通过。 - docs/sources/oshwhub.md 之前把 fs-web-stream.jlc.com 标为"工程源待查", 排查后确认那些 URL 全部是嘉立创服务侧栏/推广图标,与项目无关。 image.lceda.cn/attachments/ 是项目附件的唯一入口,现在调研文档闭合。 What: - scripts/validate.py: jsonschema 校验 + optional --check-files 核 sha256 - pyproject.toml: 加 jsonschema>=4.26 依赖 - docs/sources/oshwhub.md: fs-web-stream 归类为推广资源(已排除),附 context 证据 - log.md: 本次会话记录 Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/sources/oshwhub.md | 15 ++++--- log.md | 31 ++++++++++++- pyproject.toml | 1 + scripts/validate.py | 99 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 7 deletions(-) create mode 100644 scripts/validate.py diff --git a/docs/sources/oshwhub.md b/docs/sources/oshwhub.md index 58d8067..4e6634f 100644 --- a/docs/sources/oshwhub.md +++ b/docs/sources/oshwhub.md @@ -52,14 +52,19 @@ HTML 内嵌 escaped JSON,关键字段: - **附件** (`attachments[].src`):`https://image.lceda.cn{src}` - 例:`/attachments/2020/7/mRn5hQZRhmx5r4usGxFmy8BXsCIHw5QoAT5HaLGC.pdf` - 已验证 HTTP 200,无鉴权 - - 覆盖 PDF / ZIP / MP4 等 -- **工程源文件** (`fs-web-stream.jlc.com/fs-web-stream/file-operation/download/`) - - HTML 里出现,但能否直接下载未测试 - - 优先级:P1;v0.1 先不抓 + - 覆盖 PDF / ZIP / MP4 / CSV / BIN 等(用户自传全类型) + - **这是项目用户文件的唯一入口**(已排查确认) + +### 排查过不是项目源的路径 +- `fs-web-stream.jlc.com/fs-web-stream/file-operation/download/` + - 详情页 HTML 里出现过 13 个,context 均为站点侧栏/推广图标 + - 例:嘉立创 3D 打印 icon、开源硬件平台 badge、EDA 扩展广场 banner + - **与项目本身无关**,不抓 ### 未找到 / 留作后续 -- EasyEDA 工程源 JSON(schematic/PCB 的真正源):推测在 `u.lceda.cn` 下,需登录 +- EasyEDA 工程源 JSON(schematic/PCB 的真正源):`u.lceda.cn/api/projects` 返回 401 "尚未登录",需要登录态 - 
项目详情 JSON API:`/api/project/` 返回 `{"code":104001,"success":false}`(试过 GET/POST/路径形式均失败,疑似此端点需 session) + - 当前通过 SSR HTML 解析获取详情,字段够用 ## 已知字段与 schema 映射 diff --git a/log.md b/log.md index 42d1b00..13bb83c 100644 --- a/log.md +++ b/log.md @@ -56,8 +56,35 @@ 1. 验收 10 个项目元数据质量(随机抽 2-3 条对照原站) 2. 决定 Phase 1.4 放量目标(50?500?全量 12493?) -3. 未解决:`fs-web-stream.jlc.com` 下载(工程源?)、`u.lceda.cn` 登录态抓工程 JSON -4. Phase 2 准备:GitHub KiCad repo 调研 +3. Phase 2 准备:GitHub KiCad repo 调研 + +--- + +## 2026-04-23 19:40 fs-web-stream 排查 + schema 自动校验 + +**Claude 会话**(自主推进) + +### fs-web-stream.jlc.com 定性 + +重新抓 `/CYIIOT/ST_LINK-V2_1` 并看 13 个 `fs-web-stream.jlc.com` 链接的上下文:全部是嘉立创服务侧栏/推广图标(3D 打印、发热片、Ican、EDA 扩展广场、开源硬件平台 badge 等),**与项目本身无关**。`image.lceda.cn/attachments/` 就是项目附件的唯一入口,已确认闭环。`docs/sources/oshwhub.md` 对应章节已更新。 + +### scripts/validate.py + +jsonschema 做两层校验: +- 默认:所有 `data/raw/**/metadata.json` 对 `schemas/project.schema.json` 的结构校验 +- `--check-files`:另外验证每条 file 的本地 path 存在且 sha256 匹配 + +**结果**:10/10 项目两项全通过。 + +### 新增 + +- `scripts/validate.py` +- `pyproject.toml` 加 `jsonschema>=4.26` + +### 还是需要 Charles 决策 + +- 放量规模(推算:52MB/项目 × 12493 ≈ 650GB 全量,需评估 Gitea LFS 容量) +- 是否需要抓 `u.lceda.cn` 的 EasyEDA 源 JSON(需登录,v0.1 跳过) --- diff --git a/pyproject.toml b/pyproject.toml index 2e0a246..3bc3456 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "Open hardware design dataset for Facere." requires-python = ">=3.11" dependencies = [ "httpx[http2]>=0.27", + "jsonschema>=4.26.0", ] [tool.ruff] diff --git a/scripts/validate.py b/scripts/validate.py new file mode 100644 index 0000000..e560415 --- /dev/null +++ b/scripts/validate.py @@ -0,0 +1,99 @@ +"""Validate all data/raw///metadata.json against project.schema.json. 
# Usage:
#     uv run python scripts/validate.py                    # validate everything
#     uv run python scripts/validate.py data/raw/oshwhub   # only the given subdirectory
#     uv run python scripts/validate.py --check-files      # also verify that local files
#                                                          # exist and match their sha256

from __future__ import annotations

import argparse
import hashlib
import json
import sys
from collections.abc import Iterator
from pathlib import Path

# Repository root: this script lives in <repo>/scripts/.
REPO = Path(__file__).resolve().parent.parent
SCHEMA_PATH = REPO / "schemas" / "project.schema.json"


def iter_metadata(roots: list[Path]) -> Iterator[Path]:
    """Yield every ``metadata.json`` reachable from *roots*.

    Args:
        roots: Directories to search recursively. A root that is itself a
            file named ``metadata.json`` is yielded as-is.

    Yields:
        Paths to metadata files, sorted within each root so that the report
        order is stable between runs.

    A root that is neither a directory nor a ``metadata.json`` file is
    reported on stderr and skipped, so that a mistyped path does not look
    like a clean "0/0 passed" run.
    """
    for root in roots:
        if root.is_file() and root.name == "metadata.json":
            yield root
        elif root.is_dir():
            yield from sorted(root.rglob("metadata.json"))
        else:
            print(f"warning: skipping {root}: not a directory", file=sys.stderr)


def sha256_of(path: Path, chunk: int = 1 << 15) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    Args:
        path: File to hash; it is read in binary mode.
        chunk: Number of bytes read per iteration (default 32 KiB).

    Raises:
        ValueError: If *chunk* is not positive. A zero-sized read returns
            ``b""`` immediately, which would otherwise produce the digest of
            empty input for any file.
        OSError: If the file cannot be opened or read.
    """
    if chunk <= 0:
        raise ValueError(f"chunk must be a positive integer, got {chunk}")
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            digest.update(block)
    return digest.hexdigest()


def check_files(meta_path: Path, meta: dict) -> list[str]:
    """Return the errors found by the file-presence / hash check.

    Each entry of ``meta["files"]`` that carries a ``path`` is resolved
    relative to the directory containing *meta_path*. The file must exist as
    a regular file and, when the entry has a ``sha256``, its digest must
    match. Entries without a ``path`` are treated as URL-only and skipped.

    Args:
        meta_path: Location of the ``metadata.json`` the entries came from.
        meta: Parsed metadata document.

    Returns:
        Human-readable error strings, in the order of the ``files`` entries;
        empty when everything checks out.

    Malformed structures (non-object document, non-list ``files``, non-object
    entries, non-string paths) are reported as errors instead of raising, so
    one broken document cannot abort a whole validation run.
    """
    if not isinstance(meta, dict):
        return ["metadata is not a JSON object"]
    entries = meta.get("files", [])
    if not isinstance(entries, list):
        return ["'files' is not a list"]

    errs: list[str] = []
    proj_dir = meta_path.parent
    for index, entry in enumerate(entries):
        if not isinstance(entry, dict):
            errs.append(f"files[{index}] is not an object")
            continue
        rel = entry.get("path")
        if not rel:
            continue  # URL-only entry, ok
        if not isinstance(rel, str):
            errs.append(f"files[{index}].path is not a string")
            continue
        target = proj_dir / rel
        if not target.exists():
            errs.append(f"missing file: {rel}")
            continue
        if not target.is_file():
            # A directory passes exists() but cannot be opened for hashing.
            errs.append(f"not a regular file: {rel}")
            continue
        want = entry.get("sha256")
        if want:
            got = sha256_of(target)
            if got != want:
                errs.append(f"sha256 mismatch for {rel}: {got} != {want}")
    return errs


def _display_path(path: Path) -> Path:
    """Return *path* relative to the repository root when it lies inside it.

    ``Path.relative_to`` raises ``ValueError`` for relative paths and for
    paths outside ``REPO``; in those cases the path is returned unchanged.
    """
    try:
        return path.resolve().relative_to(REPO)
    except ValueError:
        return path


def main(argv: list[str] | None = None) -> int:
    """Validate metadata files and print a pass/fail report.

    Args:
        argv: Command-line arguments; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        ``0`` when every metadata file passed, ``1`` when at least one
        failed, ``2`` when the schema itself could not be loaded.
    """
    # Imported here so the helpers above stay importable without the
    # third-party dependency installed.
    import jsonschema

    ap = argparse.ArgumentParser(description="schema + file integrity check")
    ap.add_argument(
        "roots",
        nargs="*",
        type=Path,
        default=[REPO / "data" / "raw"],
        help="目录,递归找 metadata.json(默认 data/raw)",
    )
    ap.add_argument("--check-files", action="store_true", help="验证本地文件存在且 sha256 一致")
    args = ap.parse_args(argv)

    try:
        # Explicit UTF-8: the platform default encoding is not reliable.
        schema = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        print(f"error: cannot load schema {SCHEMA_PATH}: {exc}", file=sys.stderr)
        return 2
    validator = jsonschema.Draft202012Validator(schema)

    ok = 0
    bad = 0
    for meta_path in iter_metadata(args.roots):
        rel = _display_path(meta_path)

        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # report the file as failed and keep validating the rest.
            errors = [f"json: {exc}"]
        else:
            errors = [
                f"schema: {e.message} at {'/'.join(map(str, e.path)) or '<root>'}"
                for e in validator.iter_errors(meta)
            ]
            if args.check_files:
                errors += [f"file: {m}" for m in check_files(meta_path, meta)]

        if errors:
            bad += 1
            print(f"FAIL {rel}")
            for e in errors:
                print(f" - {e}")
        else:
            ok += 1

    total = ok + bad
    print(f"\n{ok}/{total} passed")
    return 0 if bad == 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())