# FacereDataset/tools/epro2/replay.py

"""EPRO2 state-machine replay.

Each ``.epro2`` file is a per-document op stream (already partitioned by
DOCHEAD during crawl). Replaying that stream yields the document's final
state — a dict keyed by object id with the latest payload.

This is a *prototype*. Semantics intentionally minimal:
  - DOCHEAD sets the document head (docType, uuid, editVersion, ...).
  - Any other op with an ``id`` upserts ``objects[id]`` with its payload.
  - A ``null`` / missing payload on a normally-payloaded op is treated as
    a deletion. (Empirically uncommon — flagged for review when seen.)
  - We do *not* yet model relationships (lineGroup → WIRE, NET ↔ PAD_NET, ...).
    Those belong in a higher-level translator, not the raw replay.
"""

from __future__ import annotations

import json
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path

from .parser import Op, iter_ops


@dataclass
class Document:
    """State of one Pro document after replaying its op stream.

    ``head`` and ``doc_type`` are filled from DOCHEAD ops. ``objects`` maps an
    object id (or, for id-less singleton ops, the op type itself) to the most
    recent payload, tagged with a ``_type`` entry. The remaining fields are
    tallies kept for diagnostics.
    """

    doc_uuid: str
    doc_type: str | None = None
    head: dict = field(default_factory=dict)
    objects: dict[str, dict] = field(default_factory=dict)
    op_counts: Counter[str] = field(default_factory=Counter)
    deletes: int = 0  # id-keyed objects removed by a payload-less op
    untyped_ops: int = 0  # ops with no `id` that are neither DOCHEAD nor a singleton type

    # Document-level singleton op types; they carry no `id` and are stored
    # under their own type name. Left unannotated so it is a plain class
    # attribute rather than a dataclass field.
    _SINGLETON_TYPES = frozenset(
        ("EDIT_HEAD", "META", "CANVAS", "PREFERENCE", "PANELIZE")
    )

    def apply(self, op: Op) -> None:
        """Fold a single op into the document state.

        Every op is counted in ``op_counts`` first. After that:

        - DOCHEAD with a non-empty payload replaces ``head`` and ``doc_type``;
          an empty or missing payload leaves both untouched.
        - A singleton-type op with a payload is stored under its type name;
          without a payload it is ignored.
        - Any other op without an ``id`` only bumps ``untyped_ops``.
        - An id-keyed op without a payload deletes ``objects[id]`` when it
          exists (counted in ``deletes``); with a payload it replaces
          ``objects[id]``.
        """
        kind = op.type
        self.op_counts[kind] += 1

        if kind == "DOCHEAD":
            head = op.payload
            if head:
                self.head = head
                self.doc_type = head.get("docType")
        elif kind in self._SINGLETON_TYPES:
            body = op.payload
            if body is not None:
                # Payload keys are spread last, so a payload-supplied
                # `_type` overrides the tag.
                self.objects[kind] = {"_type": kind, **body}
        elif op.id is None:
            # Nothing to address the op by — tally it so gaps in the model
            # stay visible.
            self.untyped_ops += 1
        elif op.payload is None:
            # Payload-less op on an id: deletion. Only removals that actually
            # hit an existing object are counted.
            try:
                del self.objects[op.id]
            except KeyError:
                pass
            else:
                self.deletes += 1
        else:
            self.objects[op.id] = {"_type": kind, **op.payload}


@dataclass
class Project:
    """Replay result for a whole project: its documents plus replay failures.

    ``documents`` is keyed by document UUID; ``parse_errors`` holds
    ``(doc_uuid, message)`` pairs for documents that could not be replayed.
    """

    project_uuid: str
    editor_version: str | None = None
    documents: dict[str, Document] = field(default_factory=dict)
    parse_errors: list[tuple[str, str]] = field(default_factory=list)

    def by_doc_type(self) -> dict[str, list[Document]]:
        """Group documents by ``doc_type``; a falsy type is filed under ``"?"``."""
        grouped: dict[str, list[Document]] = {}
        for doc in self.documents.values():
            label = doc.doc_type or "?"
            if label not in grouped:
                grouped[label] = []
            grouped[label].append(doc)
        return grouped

    def aggregate_op_counts(self) -> Counter[str]:
        """Return the per-op-type counts summed over every document."""
        total: Counter[str] = Counter()
        for doc in self.documents.values():
            # In-place Counter addition; the per-document counters are left
            # unmodified.
            total += doc.op_counts
        return total


def replay_document(epro2_path: Path | str) -> Document:
    """Replay one ``.epro2`` file into a :class:`Document`.

    The document UUID is the file name without its extension (the crawler
    writes ``<doc_uuid>.epro2``). Ops are applied in the order ``iter_ops``
    yields them.
    """
    source = Path(epro2_path)
    document = Document(doc_uuid=source.stem)
    for op in iter_ops(source):
        document.apply(op)
    return document


def replay_project(project_dir: Path | str) -> Project:
    """Replay all documents listed in ``<project_dir>/source/manifest.json``.

    Each manifest entry's ``path`` is resolved relative to ``project_dir``.
    A document whose replay raises is not added to ``documents``; instead a
    ``(doc_uuid, "<ExceptionType>: <message>")`` pair is appended to
    ``parse_errors`` and processing continues with the next entry.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
    """
    root = Path(project_dir)
    manifest_file = root / "source" / "manifest.json"
    if not manifest_file.exists():
        raise FileNotFoundError(f"missing {manifest_file}")
    raw_manifest = manifest_file.read_text(encoding="utf-8")
    manifest = json.loads(raw_manifest)

    project = Project(
        project_uuid=manifest["project_uuid"],
        editor_version=manifest.get("editor_version"),
    )
    for entry in manifest.get("documents", []):
        doc_path = root / entry["path"]
        try:
            document = replay_document(doc_path)
        except Exception as exc:  # noqa: BLE001 — surface as parse_errors
            message = f"{type(exc).__name__}: {exc}"
            project.parse_errors.append((entry["doc_uuid"], message))
        else:
            # Keyed by the UUID the replayed document reports for itself.
            project.documents[document.doc_uuid] = document
    return project