diff --git a/docs/sources/epro2_to_std_mapping.md b/docs/sources/epro2_to_std_mapping.md index e1e492c..69bec7b 100644 --- a/docs/sources/epro2_to_std_mapping.md +++ b/docs/sources/epro2_to_std_mapping.md @@ -1,4 +1,4 @@ -# EPRO2 OPTYPE → EasyEDA Std shape verb mapping +# EPRO2 / Pro 2.x OPTYPE → EasyEDA Std shape verb mapping For downstream adapters that consume `tools/epro2/std/`'s Option-2 output (raw `objects: {id: payload}` dict in the `dataStr` field) and need to @@ -225,6 +225,43 @@ choose to skip silently or emit best-effort placeholders: | STRING (PCB) | `TEXT` | Board-level text; field order distinct from PCB TEXT-in-LIB | | BUS / BE (SCH) | `BUS` / `BE` | Bus + bus entry — no EPRO2 sample in our corpus | +## Pro 2.x source format + +Pro 2.x projects (lceda Pro editor 2.x — Liangshan Pi, Taishan Pi RK3566 +in our corpus) use a **different on-disk format** than Pro 3.x EPRO2, +even though both come out of the same crawler. Detection: the +`source/manifest.json` file has `"editor_version": "2.x.x"`. Our +exporter auto-detects this and emits the same Std envelope, but with two +key differences the adapter must branch on: + +- `result.dataStr.head.epro_format = "pro2"` (vs absent / `"epro2"` for + Pro 3.x). This is the canonical dispatch field. +- `result.dataStr.objects` values are **JSON arrays**, not the + `{"_type": ..., **fields}` dicts EPRO2 produces. The first array + element is the OPTYPE (`["COMPONENT", "e1", "", 0, 0, 90, ...]`). + +Pro 2.x op vocabulary overlaps EPRO2 but adds editor-specific helpers: +`FONTSTYLE` / `LINESTYLE` (referenced by id from text/stroke ops), +`CONNECT` (sch wire-end to pin binding), `OBJ` (group container), +`REGION` (sch background fills), `DIMENSION` (sch annotation), +`STRING` (PCB board-level text — distinct from PCB `TEXT`), +`TEARDROP` (cosmetic fillets at via/pad). 
def _detect_pro2(project_dir: Path) -> tuple[bool, str]:
    """Return ``(is_pro2, editor_version)`` read from ``source/manifest.json``.

    A project counts as Pro 2.x when the manifest's ``editor_version``
    string starts with ``"2."`` (e.g. ``"2.1.40"``).  Detection is
    deliberately forgiving: a missing or unreadable manifest, invalid
    JSON / UTF-8, or a manifest whose top level is not a JSON object all
    yield ``(False, "")`` so the caller falls through to the existing
    EPRO2 path instead of crashing.
    """
    mani_path = project_dir / "source" / "manifest.json"
    try:
        manifest = json.loads(mani_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing or unreadable.  ValueError is the common
        # base of json.JSONDecodeError and UnicodeDecodeError.
        return (False, "")
    if not isinstance(manifest, dict):
        # Valid JSON that is not an object (e.g. ``[]``) carries no
        # editor_version to inspect.
        return (False, "")
    ev = str(manifest.get("editor_version") or "")
    return (ev.startswith("2."), ev)


def _convert_pro2(project_dir: Path, out_dir: Path,
                  editor_version: str, want_pcb: bool, want_sch: bool) -> int:
    """Convert the wanted Pro 2.x documents of one project; return how many were written.

    The manifest is read directly (no EPRO2 replay).  Each entry's
    ``docType`` routes it: ``3`` is handled when ``want_pcb`` is set,
    ``1`` when ``want_sch`` is set, anything else is ignored.  Every
    converted document is written to ``out_dir / "<doc_uuid>.json"``.

    Per-document problems never abort the run: writer exceptions and
    malformed manifest entries are reported on stderr as ``FAIL``,
    documents the writer declines (``None`` return) are reported as
    ``SKIP``, and the loop continues with the next entry.
    """
    mani_path = project_dir / "source" / "manifest.json"
    m = json.loads(mani_path.read_text(encoding="utf-8"))
    project_uuid = m.get("project_uuid") or project_dir.name

    skipped_encrypted = 0
    n = 0
    print(f"Pro 2.x project (editor {editor_version}) → {out_dir}")
    for entry in m.get("documents") or []:
        if not isinstance(entry, dict):
            continue
        dt = entry.get("docType")
        if dt == 3:
            if not want_pcb:
                continue
        elif dt == 1:
            if not want_sch:
                continue
        else:
            continue

        doc_uuid = entry.get("doc_uuid")
        rel_path = entry.get("path")
        if not isinstance(doc_uuid, str) or not isinstance(rel_path, str):
            # One malformed entry must not take the whole project down.
            print(
                f" FAIL {str(doc_uuid)[:12]}: manifest entry lacks doc_uuid/path",
                file=sys.stderr,
            )
            continue
        tag = doc_uuid[:12]

        try:
            # Clear the writer's per-call stats first so that a ``None``
            # return below is judged on this call only, never on a stale
            # ``skipped_encrypted`` flag left behind by an earlier document.
            write_pro2_doc.last_stats = None  # type: ignore[attr-defined]
            payload = write_pro2_doc(
                project_dir / rel_path,
                project_uuid=project_uuid,
                editor_version_hint=editor_version,
            )
        except Exception as e:  # noqa: BLE001
            print(f" FAIL {tag}: {e}", file=sys.stderr)
            continue

        stats = getattr(write_pro2_doc, "last_stats", None)
        if payload is None:
            if stats and stats.skipped_encrypted:
                print(
                    f" SKIP {tag}: PCB blob is "
                    f"AES-encrypted external (dataStrId+iv+key); needs "
                    f"a separate fetch+decrypt step we don't run here."
                )
                skipped_encrypted += 1
            else:
                # Previously dropped without a trace; say why nothing was written.
                print(
                    f" SKIP {tag}: no inline dataStr op-stream to convert",
                    file=sys.stderr,
                )
            continue

        out_path = out_dir / f"{doc_uuid}.json"
        out_path.write_text(
            json.dumps(payload, ensure_ascii=False, separators=(",", ":")),
            encoding="utf-8",
        )
        if stats:
            print(
                f" {tag}.json: docType={dt} "
                f"objects={stats.objects} BBox=({stats.bbox_x:g},{stats.bbox_y:g},"
                f"{stats.bbox_w:g},{stats.bbox_h:g})"
            )
        n += 1

    if skipped_encrypted:
        print(
            f" ({skipped_encrypted} encrypted-external doc(s) skipped — "
            f"the source/.json files still hold the dataStrId/iv/key "
            f"so a future fetch+decrypt pass can recover them.)"
        )
    return n
+ proj = replay_project(args.project_dir) n = 0 if args.all_pcb or args.all: n += _convert_pcbs(proj, args.out) diff --git a/tools/epro2/std/pro2_writer.py b/tools/epro2/std/pro2_writer.py new file mode 100644 index 0000000..e1201f0 --- /dev/null +++ b/tools/epro2/std/pro2_writer.py @@ -0,0 +1,243 @@ +"""Convert one EasyEDA Pro 2.x JSON document → an Option-2 Std-shaped JSON. + +Pro 2.x stores each document as a JSON file whose ``dataStr`` field is a +**plaintext op-stream** — one JSON array per line, e.g.:: + + ["DOCTYPE","SCH","1.1"] + ["HEAD",{"originX":0,"originY":0,"version":"2.1.39","maxId":4639}] + ["COMPONENT","e1","",0,0,0,0,{},0] + ["ATTR","e18","e1","Symbol","6d31...",0,0,2506,-116,0,"st1",0] + ... + +This is **not** the same wire format as Pro 3.x EPRO2 (which is a binary +op-stream with tilde/pipe delimiters, decrypted from an AES-GCM blob); +the on-disk structure is closer to lceda Std but the op vocabulary +includes Pro 2.x extras (FONTSTYLE / LINESTYLE / CONNECT / OBJ / +REGION / DIMENSION / STRING / TEARDROP) the downstream adapter has to +dispatch on. + +We don't translate the ops to a normalised dict like ``replay.Document`` +does for EPRO2 — the field positions per OPTYPE are spec-defined for +Pro 2.x and the adapter already has to walk them by index. Instead we +emit the raw op arrays into ``dataStr.objects``, keyed by id (position 1 +for most ops, OPTYPE for singletons), and tag the head with +``head.epro_format = "pro2"`` so the adapter can branch on it. + +PCB docs whose payload is encrypted-external (``dataStrId`` / ``iv`` / +``key`` instead of inline ``dataStr``) are skipped — fetching from +modules.lceda.cn + AES-decrypt is out of this writer's scope. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + + +# Pro 2.x ops that carry no addressable id (one per doc) — keyed by their +# OPTYPE in our objects dict. 
# Pro 2.x ops that carry no addressable id (one per doc) are keyed by their
# OPTYPE in the objects dict.  Every other op is keyed by its id (position 1
# of the op array).
_SINGLETON_OPTYPES: set[str] = {
    "DOCTYPE", "HEAD", "CANVAS", "ACTIVE_LAYER", "PREFERENCE",
    "SILK_OPTS", "PANELIZE", "PANELIZE_STAMP", "PANELIZE_SIDE",
    "PRIMITIVE",
}


@dataclass
class WriteStats:
    """Summary of one ``write_pro2_doc`` call, published on ``write_pro2_doc.last_stats``."""

    # Number of entries in the emitted ``objects`` dict.
    objects: int = 0
    # Best-effort bounding box, as produced by ``_gather_bbox_from_objects``.
    bbox_x: float = 0.0
    bbox_y: float = 0.0
    bbox_w: float = 0.0
    bbox_h: float = 0.0
    # True when the document was skipped because its payload is
    # encrypted-external (``dataStrId`` / ``iv`` instead of inline ``dataStr``).
    skipped_encrypted: bool = False


def _parse_datastr(s: str) -> dict[str, list]:
    """Parse a Pro 2.x op-stream into ``{key: [OPTYPE, args...]}``.

    The stream holds one JSON array per line.  Keys are chosen as follows:

    * singleton OPTYPEs (see ``_SINGLETON_OPTYPES``) are keyed by OPTYPE;
    * a non-empty string at position 1 is used as the key verbatim;
    * an integer at position 1 becomes ``"<OPTYPE>_<int>"`` so it cannot
      collide with string ids;
    * anything else gets a synthetic ``"_anon_<OPTYPE>_<n>"`` key so the
      op stays addressable instead of being dropped.

    Blank lines, lines that are not valid JSON, and lines that are not a
    non-empty array with a string OPTYPE are skipped silently.  On a
    duplicate key the last op wins.
    """
    objects: dict[str, list] = {}
    auto_id = 0
    for line in s.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            arr = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(arr, list) or not arr:
            continue
        optype = arr[0]
        if not isinstance(optype, str):
            continue

        raw_id = arr[1] if len(arr) > 1 else None
        if optype in _SINGLETON_OPTYPES:
            obj_id = optype
        elif isinstance(raw_id, str) and raw_id:
            # Most id-keyed ops carry a string id such as "e1" here.
            obj_id = raw_id
        elif isinstance(raw_id, int) and not isinstance(raw_id, bool):
            # A handful of ops use ints (LAYER, RULE_SELECTOR).  ``bool`` is
            # excluded explicitly: it is an ``int`` subclass, and a JSON
            # true/false at position 1 is a flag, not an id.
            obj_id = f"{optype}_{raw_id}"
        else:
            # Defensive fallback: keeps unknown shapes addressable.
            auto_id += 1
            obj_id = f"_anon_{optype}_{auto_id}"

        objects[obj_id] = arr
    return objects


def _gather_bbox_from_objects(objects: dict[str, list]) -> tuple[float, float, float, float]:
    """Return a best-effort ``(x, y, width, height)`` box over known coord slots.

    Only the OPTYPEs listed in ``coord_positions`` contribute, and only
    at the index pairs given there.  Coordinates this scan does not know
    about are simply not included, so the result can be smaller than the
    true extent.  A point is used only when both of its values convert to
    finite floats; returns all zeros when no usable point is found.
    """
    import math  # local import: the module's import block is outside this block

    # OPTYPE → list of (x_idx, y_idx) pairs in the op array.
    coord_positions: dict[str, list[tuple[int, int]]] = {
        # ["LINE", id, layer, x1, y1, x2, y2, ...]
        "LINE": [(3, 4), (5, 6)],
        # ["WIRE", id, x1, y1, x2, y2, ...] (sch wire)
        "WIRE": [(2, 3), (4, 5)],
        # ["VIA", id, layer, x, y, outerD, innerD, ...]
        "VIA": [(3, 4)],
        # ["RECT", id, x, y, w, h, ...] (only the anchor corner is sampled)
        "RECT": [(2, 3)],
        # ["TEXT", id, ?, x, y, ...], best-effort
        "TEXT": [(3, 4)],
        # ["COMPONENT", id, ..., x, y, rot, ...], best-effort
        "COMPONENT": [(3, 4)],
        # ["ARC", id, layer, cx, cy, ...]
        "ARC": [(3, 4)],
    }

    xs: list[float] = []
    ys: list[float] = []
    for arr in objects.values():
        if not isinstance(arr, list) or len(arr) < 3:
            continue
        optype = arr[0]
        if not isinstance(optype, str):
            continue
        for xi, yi in coord_positions.get(optype, ()):
            if xi >= len(arr) or yi >= len(arr):
                continue
            raw_x, raw_y = arr[xi], arr[yi]
            if isinstance(raw_x, bool) or isinstance(raw_y, bool):
                continue
            try:
                x = float(raw_x)
                y = float(raw_y)
            except (TypeError, ValueError):
                continue
            # Append x and y together, and only when both are usable: a
            # lone x would leave ``ys`` shorter (or empty, crashing min()),
            # and NaN/Infinity (which json.loads accepts) would poison
            # the min/max below.
            if math.isfinite(x) and math.isfinite(y):
                xs.append(x)
                ys.append(y)

    if not xs:
        return (0.0, 0.0, 0.0, 0.0)
    return (min(xs), min(ys), max(xs) - min(xs), max(ys) - min(ys))


def _layers_from_objects(objects: dict[str, list]) -> list[str]:
    """Build Std-style layer strings from the LAYER ops in ``objects``.

    For each ``["LAYER", id, type, name, attr1, color1, ...]`` op this
    emits ``"<id>~<name>~<color1>~true~false~true~"``.  ``name`` defaults
    to ``"Layer<id>"`` and the colour to ``"#000000"`` when the op is too
    short.  The three trailing flags are fixed defaults, not values read
    from the op, so the downstream adapter is expected to override them.
    """
    layers: list[str] = []
    for arr in objects.values():
        if not isinstance(arr, list) or len(arr) < 2 or arr[0] != "LAYER":
            continue
        lid = arr[1]
        lname = arr[3] if len(arr) > 3 else f"Layer{lid}"
        color = arr[5] if len(arr) > 5 else "#000000"
        layers.append(f"{lid}~{lname}~{color}~true~false~true~")
    return layers


def write_pro2_doc(
    json_path: Path,
    *,
    project_uuid: str = "",
    editor_version_hint: str = "",
) -> dict | None:
    """Read a Pro 2.x JSON file and return an Option-2 Std envelope.

    Args:
        json_path: Path of the Pro 2.x document JSON.
        project_uuid: Stored as ``result.puuid``.
        editor_version_hint: Used in ``head.editorVersion`` when the
            document's own HEAD op carries no version.

    Returns:
        ``{"success": True, "code": 0, "result": {...}}``, or ``None``
        when the document has no inline string ``dataStr`` (including
        encrypted-external documents that carry ``dataStrId`` / ``iv``).

    Side effect:
        ``write_pro2_doc.last_stats`` is replaced on every call, at
        entry, so after any return it describes this call only.
        ``skipped_encrypted`` is set on it when the document was skipped
        as encrypted-external.

    Raises:
        OSError / ValueError: if the file cannot be read or is not JSON.
    """
    # Reset first: every early return below must leave stats that belong
    # to this call, not to whichever document was processed before.
    write_pro2_doc.last_stats = WriteStats()  # type: ignore[attr-defined]

    raw = json.loads(json_path.read_text(encoding="utf-8"))
    if not isinstance(raw, dict):
        return None

    s = raw.get("dataStr")
    if not isinstance(s, str):
        # Encrypted-external: the op-stream lives behind dataStrId and is
        # AES-decrypted with the iv+key fields.  We don't fetch it.
        if "dataStrId" in raw or "iv" in raw:
            write_pro2_doc.last_stats = WriteStats(skipped_encrypted=True)  # type: ignore[attr-defined]
        return None

    doc_uuid = raw.get("uuid") or json_path.stem
    title = raw.get("title") or raw.get("display_title") or doc_uuid[:12]
    doc_type_int = raw.get("docType")

    objects = _parse_datastr(s)
    bbox_x, bbox_y, bbox_w, bbox_h = _gather_bbox_from_objects(objects)
    layers = _layers_from_objects(objects)

    # Prefer the version recorded in the document's own HEAD op over the
    # caller-supplied hint.
    head_op = objects.get("HEAD")
    pro2_version = ""
    if isinstance(head_op, list) and len(head_op) > 1 and isinstance(head_op[1], dict):
        pro2_version = head_op[1].get("version", "")

    result = {
        "uuid": doc_uuid,
        "puuid": project_uuid,
        "title": title,
        "description": raw.get("description", ""),
        "docType": doc_type_int,
        "components": {},
        "dataStr": {
            "head": {
                "docType": str(doc_type_int) if doc_type_int is not None else "",
                "editorVersion": (
                    f"facere-pro2/0.1 (lceda {pro2_version or editor_version_hint})"
                ),
                "units": "mil",
                "epro_format": "pro2",
                "pro2_doc_uuid": doc_uuid,
                "pro2_editor_version": pro2_version,
            },
            "BBox": {
                "x": bbox_x,
                "y": bbox_y,
                "width": bbox_w,
                "height": bbox_h,
            },
            "layers": layers,
            "objects": objects,
            "preference": {},
            "netColors": [],
            "DRCRULE": {},
        },
    }
    write_pro2_doc.last_stats = WriteStats(  # type: ignore[attr-defined]
        objects=len(objects),
        bbox_x=bbox_x, bbox_y=bbox_y, bbox_w=bbox_w, bbox_h=bbox_h,
    )
    return {"success": True, "code": 0, "result": result}
def test_pro2_parses_inline_datastr_into_objects_dict():
    """Pro 2.x docs store dataStr as a plaintext op-stream (one JSON
    array per line). Each id-keyed op lands in objects[id] as the raw
    array; singletons (DOCTYPE / HEAD / ...) are keyed by their OPTYPE
    name. The head carries ``epro_format = "pro2"`` for adapter dispatch."""
    import tempfile
    from pathlib import Path
    from tools.epro2.std.pro2_writer import write_pro2_doc

    json_text = json.dumps({
        "uuid": "doc-1",
        "title": "test sch",
        "docType": 1,
        "dataStr": (
            '["DOCTYPE","SCH","1.1"]\n'
            '["HEAD",{"originX":0,"originY":0,"version":"2.1.39","maxId":42}]\n'
            '["COMPONENT","e1","",0,0,90,0,{},0]\n'
            '["WIRE","e2",100,200,300,200]\n'
            '["LINE","e3",1,500,600,700,600,6,"st1",0]\n'
        ),
    })
    # A temporary directory is removed on exit, so the fixture file no
    # longer accumulates in the system temp dir on every test run.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = Path(tmp_dir) / "doc-1.json"
        # Written as UTF-8 because the writer reads the file as UTF-8.
        tmp.write_text(json_text, encoding="utf-8")
        payload = write_pro2_doc(tmp, project_uuid="proj-1")

    assert payload is not None
    objs = payload["result"]["dataStr"]["objects"]
    # singletons keyed by OPTYPE
    assert objs["DOCTYPE"] == ["DOCTYPE", "SCH", "1.1"]
    assert objs["HEAD"][0] == "HEAD"
    # id-keyed by position-1 string
    assert objs["e1"] == ["COMPONENT", "e1", "", 0, 0, 90, 0, {}, 0]
    assert objs["e2"] == ["WIRE", "e2", 100, 200, 300, 200]
    assert objs["e3"] == ["LINE", "e3", 1, 500, 600, 700, 600, 6, "st1", 0]
    # head carries the format hint the adapter dispatches on
    head = payload["result"]["dataStr"]["head"]
    assert head["epro_format"] == "pro2"
    assert head["units"] == "mil"
    assert "2.1.39" in head["editorVersion"]


def test_pro2_skips_encrypted_external_pcb():
    """Pro 2.x PCB docs sometimes carry `dataStrId` + `iv`/`key` instead
    of an inline dataStr. The writer does not fetch those; it returns
    None so the CLI can skip the doc instead of emitting a stub JSON."""
    import tempfile
    from pathlib import Path
    from tools.epro2.std.pro2_writer import write_pro2_doc

    enc = json.dumps({
        "uuid": "doc-pcb",
        "docType": 3,
        "dataStrId": "https://modules.lceda.cn/datastr/abc...",
        "iv": "abcd1234abcd1234abcd1234",
        "key": "deadbeef" * 8,
    })
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = Path(tmp_dir) / "doc-pcb.json"
        tmp.write_text(enc, encoding="utf-8")
        assert write_pro2_doc(tmp) is None