"""EPRO2 state-machine replay.

Each ``.epro2`` file is a per-document op stream (already partitioned by
DOCHEAD during crawl). Replaying that stream yields the document's final
state — a dict keyed by object id with the latest payload.

This is a *prototype*. Semantics intentionally minimal:

- DOCHEAD sets the document head (docType, uuid, editVersion, ...).
- Document-level singleton ops (EDIT_HEAD, META, CANVAS, PREFERENCE,
  PANELIZE) are stored in ``objects`` under their op type.
- Any other op with an ``id`` upserts ``objects[id]`` with its payload.
- A ``null`` / missing payload on a normally-payloaded op is treated as a
  deletion. (Empirically uncommon — flagged for review when seen.)
- We do *not* yet model relationships (lineGroup → WIRE, NET ↔ PAD_NET, ...).
  Those belong in a higher-level translator, not the raw replay.
"""

from __future__ import annotations

import json
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path

from .parser import Op, iter_ops

# Op types that describe the document as a whole and carry no `id` field.
# They are stored in ``Document.objects`` keyed by the op type itself.
_SINGLETON_OP_TYPES = frozenset(
    {"EDIT_HEAD", "META", "CANVAS", "PREFERENCE", "PANELIZE"}
)


@dataclass
class Document:
    """Replayed state of a single Pro document."""

    doc_uuid: str
    doc_type: str | None = None
    head: dict = field(default_factory=dict)
    objects: dict[str, dict] = field(default_factory=dict)
    op_counts: Counter[str] = field(default_factory=Counter)
    # Number of id-keyed ops with a null payload that removed an existing object.
    deletes: int = 0
    # Ops with no `id` that are neither DOCHEAD nor one of the document-level
    # singleton types (EDIT_HEAD / META / CANVAS / PREFERENCE / PANELIZE).
    untyped_ops: int = 0

    def apply(self, op: Op) -> None:
        """Fold a single op into the document state.

        Every op is tallied in ``op_counts`` regardless of how it is handled.
        The op is then dispatched, in order:

        1. ``DOCHEAD`` — a non-empty payload replaces ``head`` and sets
           ``doc_type`` from its ``docType`` key.
        2. Singleton op types — a non-``None`` payload is stored under
           ``objects[op.type]``.
        3. Ops without an ``id`` — counted in ``untyped_ops`` and ignored.
        4. Id-keyed ops with a ``None`` payload — treated as a deletion.
        5. Everything else — upserts ``objects[op.id]``.
        """
        self.op_counts[op.type] += 1

        if op.type == "DOCHEAD":
            if op.payload:
                self.head = op.payload
                self.doc_type = op.payload.get("docType")
            return

        # Document-level singletons that don't have an `id` field.
        if op.type in _SINGLETON_OP_TYPES:
            if op.payload is not None:
                # NOTE(review): a payload key named "_type" would override the
                # synthetic tag here — confirm payloads never carry that key.
                self.objects[op.type] = {"_type": op.type, **op.payload}
            return

        if op.id is None:
            # Op carries no addressable id — keep a tally so we know if our
            # model is missing a category.
            self.untyped_ops += 1
            return

        if op.payload is None:
            # Empty payload on an id-keyed op — treat as deletion. Only count
            # it when an object was actually removed (stored values are always
            # dicts, so a `None` result means the id was unknown).
            if self.objects.pop(op.id, None) is not None:
                self.deletes += 1
            return

        self.objects[op.id] = {"_type": op.type, **op.payload}


@dataclass
class Project:
    """Replayed state across all documents of a project."""

    project_uuid: str
    editor_version: str | None = None
    documents: dict[str, Document] = field(default_factory=dict)
    # (document label, "<ExceptionType>: <message>") for documents that failed.
    parse_errors: list[tuple[str, str]] = field(default_factory=list)

    def by_doc_type(self) -> dict[str, list[Document]]:
        """Group documents by ``doc_type``; a missing type is bucketed as "?"."""
        out: dict[str, list[Document]] = {}
        for d in self.documents.values():
            out.setdefault(d.doc_type or "?", []).append(d)
        return out

    def aggregate_op_counts(self) -> Counter[str]:
        """Return the per-op-type counts summed over every document."""
        agg: Counter[str] = Counter()
        for d in self.documents.values():
            agg += d.op_counts
        return agg


def replay_document(epro2_path: Path | str) -> Document:
    """Replay a single ``.epro2`` file.

    The document UUID is taken from the file name's stem; the crawler is
    expected to write one ``<doc_uuid>.epro2`` file per document.
    """
    p = Path(epro2_path)
    d = Document(doc_uuid=p.stem)
    for op in iter_ops(p):
        d.apply(op)
    return d


def _manifest_entry_label(entry: object) -> str:
    """Return a best-effort label identifying a manifest entry in error reports."""
    if isinstance(entry, dict):
        label = entry.get("doc_uuid") or entry.get("path")
        if label:
            return str(label)
    return repr(entry)


def replay_project(project_dir: Path | str) -> Project:
    """Replay every document listed in ``<project_dir>/source/manifest.json``.

    Each manifest entry's ``path`` is resolved relative to ``project_dir``.
    A document that cannot be replayed — including a manifest entry that is
    malformed (e.g. has no ``path``) — is recorded in ``Project.parse_errors``
    and skipped, so one bad document never aborts the whole project.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
        KeyError: if the manifest has no ``project_uuid``.
    """
    pdir = Path(project_dir)
    manifest_path = pdir / "source" / "manifest.json"
    if not manifest_path.exists():
        raise FileNotFoundError(f"missing {manifest_path}")

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    proj = Project(
        project_uuid=manifest["project_uuid"],
        editor_version=manifest.get("editor_version"),
    )

    for entry in manifest.get("documents", []):
        try:
            # The path lookup is inside the `try` so a malformed entry is
            # reported like any other per-document failure.
            d = replay_document(pdir / entry["path"])
        except Exception as e:  # noqa: BLE001 — surface as parse_errors
            # Build the label defensively: indexing entry["doc_uuid"] here
            # could itself raise and mask the original error.
            proj.parse_errors.append(
                (_manifest_entry_label(entry), f"{type(e).__name__}: {e}")
            )
            continue
        proj.documents[d.doc_uuid] = d

    return proj