Zer0pa · Zer0pa-Architect-Prime · Apr 8, 2026
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
@@ -1,26 +1,34 @@
 # ZPE-Ink Benchmarks
 
-This file is a scaffold for Phase 2.
-Real public-dataset rows land in Phase 3 only.
-Do not backfill synthetic or proxy data here.
+Public rows only. No proxy data is promoted here.
 
-## Reproducible Methodology
+## Methodology
 
-1. Record the repo commit, host, OS, Python version, Swift version, Rust toolchain, and browser/runtime version.
-2. Record the exact dataset URL or local path, the command used, the sample count, and whether the source is public or proxy.
-3. Run `encode -> decode -> verify` on the same sample set.
-4. Capture raw bytes, encoded bytes, ratio, fidelity metric, and wall-clock timing.
-5. Keep proxy/demo rows separate from public dataset rows.
+1. Run from the repo root: `python code/scripts/run_phase3_public_benchmarks.py`
+2. Artifact path: `proofs/reruns/phase3_public_benchmarks/phase3_public_benchmarks.json`
+3. Baseline: raw float32 `x/y` payload, matching the repo's current authority surface.
+4. Fidelity rule: `exact` means the decoded strokes compared equal to the source strokes, value for value, in every integer channel.
+5. Registration-gated datasets stay blocked until the real corpus is acquired in-lane. No proxy values are substituted into this table.
 
-## Phase 2 Scaffold
+## Dataset Table
 
-| dataset | source | baseline | zpe | ratio | fidelity | notes |
-|---|---|---|---|---|---|---|
-| Synthetic proxy | repo fixtures | raw float32 | n/a | n/a | roundtrip only | Proxy/demo surface only |
-| Public datasets | reserved for Phase 3 | raw float32 | n/a | n/a | n/a | IAM, CASIA, and other real rows land in Phase 3 |
+| dataset | strokes | points_per_stroke | raw_size | compressed | ratio | roundtrip_fidelity |
+|---|---:|---:|---:|---:|---:|---|
+| IAM On-Line Handwriting | blocked | blocked | blocked | blocked | blocked | blocked |
+| CASIA Online Handwriting | blocked | blocked | blocked | blocked | blocked | blocked |
+| UJI Pen Characters | 1,854 | 40.23 | 596,736 B | 370,379 B | 1.6111x | exact |
 
-## Phase 3 Reservation
+## Sources
 
-- Use only freely available public datasets.
-- Replace proxy rows with measured rows from runnable scripts.
-- Cite the dataset URL, the exact command, and the fidelity metric for each row.
+- IAM On-Line Handwriting: `https://fki.tic.heia-fr.ch/databases/iam-on-line-handwriting-database`
+  Probe result on 2026-04-08: `HTTP/1.1 200 OK`. No direct public corpus download was established for this phase.
+- CASIA Online Handwriting: `https://nlpr.ia.ac.cn/databases/handwriting/home.html`
+  Probe result on 2026-04-08: `rc=28`, `status=000` after the bounded 20-second probe. No direct public corpus download was established for this phase.
+- UJI Pen Characters: `https://archive.ics.uci.edu/dataset/160/uji+pen+characters`
+  Download URL used: `https://archive.ics.uci.edu/static/public/160/uji+pen+characters.zip`
+  Archive SHA-256: `06e484103d21ead80ec7675059d3ffe66f39f51bfcb9c77a00fbbfb1c85546dc`
+
+## Notes
+
+- UJI metrics were measured over `1,364` isolated-character samples and `74,592` total points.
+- This file does not widen the repo claim surface beyond the current structured-tier authority boundary.
diff --git a/code/scripts/run_phase3_public_benchmarks.py b/code/scripts/run_phase3_public_benchmarks.py
@@ -0,0 +1,193 @@
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import tempfile
+import zipfile
+from pathlib import Path
+from typing import Any
+
+# This script lives in `code/scripts/`, so ROOT is the `code/` directory and
+# REPO_ROOT is the repository root one level above it.
+ROOT = Path(__file__).resolve().parents[1]
+REPO_ROOT = ROOT.parent
+# Put `code/` on sys.path so the `scripts` and `zpe_ink` packages can be
+# imported when this file is executed directly as a script.
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from scripts.shared import append_command_log, run_command, write_json
+from zpe_ink.benchmarks import measure_dataset
+from zpe_ink.io import sha256_file
+from zpe_ink.unipen import load_uji_pen_characters
+
+
+# Landing pages that are probed for reachability; IAM and CASIA are emitted as
+# "blocked" rows rather than measured.
+IAM_URL = "https://fki.tic.heia-fr.ch/databases/iam-on-line-handwriting-database"
+CASIA_URL = "https://nlpr.ia.ac.cn/databases/handwriting/home.html"
+# UJI: the dataset page (probed) and the direct archive URL (downloaded and measured).
+UJI_PAGE_URL = "https://archive.ics.uci.edu/dataset/160/uji+pen+characters"
+UJI_ZIP_URL = "https://archive.ics.uci.edu/static/public/160/uji+pen+characters.zip"
+
+
+def _safe_extract_zip(archive_path: Path, destination: Path) -> list[str]:
+    """Extract every member of a zip archive, refusing paths that escape ``destination``.
+
+    Args:
+        archive_path: Zip file to read.
+        destination: Directory that every extracted member must stay inside.
+
+    Returns:
+        The member names, in archive order, that were extracted.
+
+    Raises:
+        ValueError: If a member's resolved path falls outside ``destination``.
+            Members that precede the offending one have already been extracted.
+    """
+    extracted: list[str] = []
+    destination = destination.resolve()
+    with zipfile.ZipFile(archive_path) as handle:
+        for member in handle.infolist():
+            target = (destination / member.filename).resolve()
+            # Compare path components, not string prefixes: a plain
+            # ``startswith`` check would accept a sibling directory such as
+            # ``<destination>-evil/x`` because it shares the textual prefix.
+            if target != destination and destination not in target.parents:
+                raise ValueError(f"unsafe zip path: {member.filename}")
+            handle.extract(member, destination)
+            extracted.append(member.filename)
+    return extracted
+
+
+def _probe_url(url: str, log_path: Path, label: str) -> dict[str, Any]:
+    """Probe ``url`` with curl and summarise the outcome.
+
+    curl is asked to follow redirects, discard the body, give up after 20
+    seconds, and print ``<http_code> <effective_url>``. That output is split
+    into the returned fields; when stdout is empty the status code stays
+    ``"000"`` and the final URL falls back to ``url``.
+
+    Args:
+        url: Address to probe.
+        log_path: Command log passed through to ``run_command``.
+        label: Log label passed through to ``run_command``.
+
+    Returns:
+        A dict with ``url``, ``returncode``, ``status_code`` and ``final_url``.
+    """
+    curl_argv = ["curl", "-L", "-sS", "--max-time", "20"]
+    curl_argv += ["-o", "/dev/null"]
+    curl_argv += ["-w", "%{http_code} %{url_effective}"]
+    curl_argv.append(url)
+    outcome = run_command(curl_argv, log_path, label)
+
+    # At most two fields: the status code, then the effective URL.
+    fields = outcome["stdout"].strip().split(maxsplit=1)
+    status_code = fields[0] if fields else "000"
+    final_url = fields[1] if len(fields) == 2 else url
+
+    summary: dict[str, Any] = {"url": url}
+    summary["returncode"] = outcome["returncode"]
+    summary["status_code"] = status_code
+    summary["final_url"] = final_url
+    return summary
+
+
+def _blocked_row(name: str, url: str, probe: dict[str, Any], reason: str) -> dict[str, Any]:
+    """Build the table row for a dataset that could not be measured.
+
+    Every metric field is ``None``; ``status`` is ``"blocked"``, and the probe
+    result and explanatory note are carried along unchanged.
+    """
+    metric_fields = (
+        "strokes",
+        "points_per_stroke",
+        "raw_size_bytes",
+        "compressed_size_bytes",
+        "compression_ratio",
+        "roundtrip_fidelity",
+    )
+    row: dict[str, Any] = {"dataset": name, "source_url": url, "status": "blocked"}
+    # dict.fromkeys defaults every value to None and keeps the field order.
+    row.update(dict.fromkeys(metric_fields))
+    row["note"] = reason
+    row["probe"] = probe
+    return row
+
+
+def _blocked_note(probe: dict[str, Any]) -> str:
+    """Describe a probe result in one sentence for a blocked row's note.
+
+    A zero return code together with a ``2xx`` status yields the "reachable"
+    wording; anything else yields the "failed or timed out" wording.
+    """
+    rc = probe["returncode"]
+    status = probe["status_code"]
+    if not (rc == 0 and status.startswith("2")):
+        prefix = "Official access probe failed or timed out from this environment, "
+        return prefix + f"with rc={rc} and status={status}."
+    return f"Official page reachable via HTTP {status}, but no direct public corpus download was established for this phase."
+
+
+def _benchmark_uji(log_path: Path) -> dict[str, Any]:
+    """Download the UJI Pen Characters archive, measure it, and return its benchmark row.
+
+    The archive is fetched with curl into a temporary directory, extracted with
+    ``_safe_extract_zip``, loaded via ``load_uji_pen_characters`` and passed to
+    ``measure_dataset``. The temporary directory is removed on exit.
+
+    Args:
+        log_path: Command log that receives the download and extraction entries.
+
+    Returns:
+        A row dict with ``status`` set to ``"measured"``, the metrics reported
+        by ``measure_dataset``, and the SHA-256 of the downloaded archive.
+
+    Raises:
+        RuntimeError: If curl fails (including an HTTP error status) or leaves
+            no non-empty archive file on disk.
+    """
+    with tempfile.TemporaryDirectory(prefix="zpe-ink-phase3-uji-") as temp_root_str:
+        temp_root = Path(temp_root_str)
+        archive_path = temp_root / "uji_pen_characters.zip"
+        extract_root = temp_root / "uji_pen_characters"
+
+        download = run_command(
+            # --fail makes curl exit non-zero on HTTP >= 400 instead of saving
+            # the server's error page as if it were the archive.
+            ["curl", "-L", "-sS", "--fail", "--max-time", "60", UJI_ZIP_URL, "-o", str(archive_path)],
+            log_path,
+            "phase3_uji_download",
+        )
+        archive_ok = (
+            download["returncode"] == 0
+            and archive_path.is_file()
+            and archive_path.stat().st_size > 0
+        )
+        if not archive_ok:
+            raise RuntimeError("failed to download UJI Pen Characters archive")
+
+        extracted = _safe_extract_zip(archive_path, extract_root)
+        # Extraction runs in-process, so it is logged by hand with a
+        # pseudo-command and a zero return code.
+        append_command_log(
+            log_path,
+            "phase3_uji_extract",
+            f"safe-unzip {archive_path}",
+            0,
+            f"members={len(extracted)}",
+            "",
+        )
+
+        samples = load_uji_pen_characters(extract_root, limit=100000)
+        metrics = measure_dataset(samples)
+        return {
+            "dataset": "UJI Pen Characters",
+            "source_url": UJI_PAGE_URL,
+            "download_url": UJI_ZIP_URL,
+            "status": "measured",
+            "strokes": metrics["stroke_count"],
+            "points_per_stroke": metrics["average_points_per_stroke"],
+            "raw_size_bytes": metrics["raw_size_bytes"],
+            "compressed_size_bytes": metrics["compressed_size_bytes"],
+            "compression_ratio": metrics["compression_ratio"],
+            "roundtrip_fidelity": metrics["roundtrip_fidelity"],
+            "sample_count": metrics["sample_count"],
+            "point_count": metrics["point_count"],
+            "mode": metrics["mode"],
+            "seed": metrics["seed"],
+            # Hash while the temporary directory still exists.
+            "archive_sha256": sha256_file(archive_path),
+        }
+
+
+def main() -> int:
+    """Run the Phase 3 public benchmarks and write the JSON artifact.
+
+    Probes the three dataset pages, measures UJI Pen Characters, assembles
+    blocked rows for IAM and CASIA, writes ``phase3_public_benchmarks.json``
+    under the artifact root, and prints the same payload to stdout.
+
+    Returns:
+        ``0`` on success, used as the process exit status.
+    """
+    default_root = REPO_ROOT / "proofs" / "reruns" / "phase3_public_benchmarks"
+    cli = argparse.ArgumentParser(description="Run public Phase 3 dataset benchmarks for ZPE-Ink.")
+    cli.add_argument(
+        "--artifact-root",
+        default=str(default_root),
+        help="Directory for JSON artifacts and command logs",
+    )
+    options = cli.parse_args()
+
+    out_dir = Path(options.artifact_root)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    command_log = out_dir / "command_log.txt"
+
+    # Probes run before the download so the command log keeps this order.
+    iam_probe = _probe_url(IAM_URL, command_log, "phase3_iam_probe")
+    casia_probe = _probe_url(CASIA_URL, command_log, "phase3_casia_probe")
+    uji_probe = _probe_url(UJI_PAGE_URL, command_log, "phase3_uji_page_probe")
+
+    measured_row = _benchmark_uji(command_log)
+
+    gated = (
+        ("IAM On-Line Handwriting", IAM_URL, iam_probe),
+        ("CASIA Online Handwriting", CASIA_URL, casia_probe),
+    )
+    rows = [
+        _blocked_row(title, page, probe, f"Registration-gated dataset. {_blocked_note(probe)}")
+        for title, page, probe in gated
+    ]
+    rows.append(measured_row)
+
+    probes = {"iam": iam_probe, "casia": casia_probe, "uji_page": uji_probe}
+    payload = {
+        "generated_from": "code/scripts/run_phase3_public_benchmarks.py",
+        "baseline": "raw float32 xy payload",
+        "rows": rows,
+        "probes": probes,
+    }
+    write_json(out_dir / "phase3_public_benchmarks.json", payload)
+    print(json.dumps(payload, indent=2, sort_keys=True))
+    return 0
+
+
+# Script entry point: main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/code/tests/test_benchmarks.py b/code/tests/test_benchmarks.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+import random
+
+from zpe_ink.benchmarks import measure_dataset
+from zpe_ink.fixtures import generate_directional_stroke
+
+
+def test_measure_dataset_reports_exact_roundtrip() -> None:
+    """Four single-stroke synthetic samples shrink and round-trip exactly."""
+    fixture_seed = 20260408
+    rng = random.Random(fixture_seed)
+    samples = []
+    for _ in range(4):
+        samples.append([generate_directional_stroke(rng, segments=8)])
+
+    metrics = measure_dataset(samples, seed=fixture_seed)
+
+    assert (metrics["sample_count"], metrics["stroke_count"]) == (4, 4)
+    assert metrics["point_count"] > 0
+    assert metrics["compressed_size_bytes"] < metrics["raw_size_bytes"]
+    assert metrics["compression_ratio"] > 1.0
+    assert metrics["roundtrip_fidelity"] == "exact"
+
+
+def test_measure_dataset_handles_empty_input() -> None:
+    """An empty dataset yields all-zero metrics and still reports ``exact``."""
+    expected = {
+        "sample_count": 0,
+        "stroke_count": 0,
+        "point_count": 0,
+        "average_points_per_stroke": 0.0,
+        "raw_size_bytes": 0,
+        "compressed_size_bytes": 0,
+        "compression_ratio": 0.0,
+        "roundtrip_fidelity": "exact",
+    }
+
+    metrics = measure_dataset([])
+
+    for field, value in expected.items():
+        assert metrics[field] == value
diff --git a/code/zpe_ink/benchmarks.py b/code/zpe_ink/benchmarks.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from typing import Any
+
+from zpe_ink.codec import decode_zpink, encode_zpink
+from zpe_ink.phase2_authority import raw_float32_xy_payload
+
+
+# One sample is a list of strokes; each stroke maps a channel name (such as
+# "x") to a list of integers.
+Sample = list[dict[str, list[int]]]
+
+
+def measure_dataset(samples: list[Sample], *, mode: str = "lossless", seed: int = 20260408) -> dict[str, Any]:
+    """Encode, decode and size every non-empty sample, then aggregate the totals.
+
+    Empty samples are skipped and not counted. The raw size comes from
+    ``raw_float32_xy_payload`` and the compressed size from the encoded
+    output. Fidelity is ``"exact"`` only if every decoded sample compared
+    equal to its input, which is also the result when nothing was measured.
+
+    Args:
+        samples: Dataset to measure.
+        mode: Passed through to ``encode_zpink`` and echoed in the result.
+        seed: Passed through to ``encode_zpink`` and echoed in the result.
+
+    Returns:
+        Counts, byte totals, average points per stroke (2 decimals),
+        compression ratio (4 decimals, ``0.0`` when nothing was encoded),
+        the fidelity verdict, and the ``mode`` and ``seed`` used.
+    """
+    raw_bytes = 0
+    packed_bytes = 0
+    strokes_seen = 0
+    points_seen = 0
+    samples_seen = 0
+    all_exact = True
+
+    for strokes in samples:
+        if strokes:
+            blob = encode_zpink(strokes, mode=mode, seed=seed)
+            restored = decode_zpink(blob)["strokes"]
+
+            raw_bytes += len(raw_float32_xy_payload(strokes))
+            packed_bytes += len(blob)
+            strokes_seen += len(strokes)
+            points_seen += sum(len(stroke["x"]) for stroke in strokes)
+            samples_seen += 1
+            # Once a mismatch is recorded, later comparisons are skipped.
+            if all_exact and not (restored == strokes):
+                all_exact = False
+
+    mean_points = points_seen / strokes_seen if strokes_seen else 0.0
+    ratio = raw_bytes / packed_bytes if packed_bytes else 0.0
+    verdict = "exact" if all_exact else "mismatch"
+
+    return {
+        "sample_count": samples_seen,
+        "stroke_count": strokes_seen,
+        "point_count": points_seen,
+        "average_points_per_stroke": round(mean_points, 2),
+        "raw_size_bytes": raw_bytes,
+        "compressed_size_bytes": packed_bytes,
+        "compression_ratio": round(ratio, 4),
+        "roundtrip_fidelity": verdict,
+        "mode": mode,
+        "seed": seed,
+    }
diff --git a/proofs/reruns/phase3_public_benchmarks/command_log.txt b/proofs/reruns/phase3_public_benchmarks/command_log.txt
@@ -0,0 +1,30 @@
+[2026-04-08T03:04:13.947081+00:00] phase3_iam_probe
+CMD: curl -L -sS --max-time 20 -o /dev/null -w %{http_code} %{url_effective} https://fki.tic.heia-fr.ch/databases/iam-on-line-handwriting-database
+RC: 0
+STDOUT:
+200 https://fki.tic.heia-fr.ch/databases/iam-on-line-handwriting-database
+---
+[2026-04-08T03:04:34.043540+00:00] phase3_casia_probe
+CMD: curl -L -sS --max-time 20 -o /dev/null -w %{http_code} %{url_effective} https://nlpr.ia.ac.cn/databases/handwriting/home.html
+RC: 28
+STDOUT:
+000 https://nlpr.ia.ac.cn/databases/handwriting/home.html
+STDERR:
+curl: (28) Connection timed out after 20008 milliseconds
+---
+[2026-04-08T03:04:35.907531+00:00] phase3_uji_page_probe
+CMD: curl -L -sS --max-time 20 -o /dev/null -w %{http_code} %{url_effective} https://archive.ics.uci.edu/dataset/160/uji+pen+characters
+RC: 0
+STDOUT:
+200 https://archive.ics.uci.edu/dataset/160/uji+pen+characters
+---
+[2026-04-08T03:04:39.788594+00:00] phase3_uji_download
+CMD: curl -L -sS --max-time 60 https://archive.ics.uci.edu/static/public/160/uji+pen+characters.zip -o /var/folders/49/b6bj2qfd5nl4vndxv7rb9dh00000gn/T/zpe-ink-phase3-uji-o33zjkk1/uji_pen_characters.zip
+RC: 0
+---
+[2026-04-08T03:04:39.810579+00:00] phase3_uji_extract
+CMD: safe-unzip /var/folders/49/b6bj2qfd5nl4vndxv7rb9dh00000gn/T/zpe-ink-phase3-uji-o33zjkk1/uji_pen_characters.zip
+RC: 0
+STDOUT:
+members=12
+---