From 2bfd0eb2a4f24a2de224d17cf9b2b83a03f935e2 Mon Sep 17 00:00:00 2001 From: Samet Date: Fri, 27 Mar 2026 15:40:56 +0100 Subject: [PATCH 1/4] docs: converged manifest format v5 with dual component resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update lerobot.md (v4→v5): dual-path resolution (type + class_path), drop _normalize_metadata(), direct Pydantic parsing, rewrite runner resolution, processor construction, migration, and comparison table - Update inferencekit.md (v4→v5): dual resolution in 'How models are loaded', add PhysicalAI-native class_path+init_args manifest example Key design decisions documented: - type + flat params (LeRobot interop) and class_path + init_args (PhysicalAI full-power) both resolve through ComponentRegistry - No _normalize_metadata() shim — clean cut to nested Pydantic models - from_legacy_metadata() handles pre-manifest YAML only --- docs/design/components/inferencekit.md | 241 +++-- docs/design/integrations/lerobot.md | 1266 +++++++++++++++--------- 2 files changed, 996 insertions(+), 511 deletions(-) diff --git a/docs/design/components/inferencekit.md b/docs/design/components/inferencekit.md index bd3efdf..2b16c39 100644 --- a/docs/design/components/inferencekit.md +++ b/docs/design/components/inferencekit.md @@ -438,77 +438,169 @@ class Postprocessor(ABC): ### Manifest Format -All exported models use a unified `manifest.json` format. The manifest uses `class_path` + `init_args` (following `jsonargparse` conventions) for component specification: +All exported models use a unified `manifest.json` format. The manifest uses a nested structure that mirrors the `InferenceModel` class hierarchy, with logical sections for policy identity, inference configuration, hardware, and metadata: + +```text +manifest.json +├── format + version (envelope) +├── policy (identity — what policy is this?) 
+│ ├── name (human-readable name) +│ └── source (provenance: repo_id, class_path) +├── inference (InferenceModel — how to run it?) +│ ├── n_obs_steps (observation window) +│ ├── runner (execution pattern + params) +│ ├── artifacts (model files by named role) +│ └── io (shapes, preprocessors, postprocessors) +├── hardware (deployment — what hardware?) +│ ├── robots (robot configurations) +│ └── cameras (camera configurations) +└── metadata (provenance — when/who created this?) +``` ```json { "format": "policy_package", "version": "1.0", - "robots": [ - { - "name": "main", - "type": "Koch v1.1", - "state": { "shape": [14], "dtype": "float32" }, - "action": { "shape": [14], "dtype": "float32" } - } - ], - "cameras": [ - { - "name": "top", - "shape": [3, 480, 640], - "dtype": "uint8" - }, - { - "name": "wrist", - "shape": [3, 480, 640], - "dtype": "uint8" - } - ], "policy": { "name": "my_model", - "kind": "single_pass" + "source": { + "repo_id": "user/my_model", + "class_path": "mypackage.policies.MyPolicy" + } }, - "artifacts": { - "onnx": "model.onnx" + "inference": { + "n_obs_steps": 1, + "runner": { + "type": "action_chunking", + "chunk_size": 100, + "n_action_steps": 100 + }, + "artifacts": { + "model": "model.onnx" + }, + "io": { + "inputs": [ + {"name": "observation.image", "dtype": "float32", "shape": ["B", 3, 96, 96]}, + {"name": "observation.state", "dtype": "float32", "shape": ["B", 14]} + ], + "outputs": [ + {"name": "action", "dtype": "float32", "shape": ["B", 100, 14]} + ], + "preprocessors": [ + { + "type": "normalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["observation.state"] + } + ], + "postprocessors": [ + { + "type": "denormalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["action"] + } + ] + } }, - "runner": { - "class_path": "inferencekit.runners.SinglePassRunner", - "init_args": {} + "hardware": { + "robots": [], + "cameras": [] }, - "adapter": { - "class_path": 
"inferencekit.adapters.ONNXAdapter", - "init_args": { - "providers": ["CUDAExecutionProvider", "CPUExecutionProvider"] + "metadata": { + "created_at": "2026-03-27T12:00:00Z", + "created_by": "mypackage.export" + } +} +``` + +> **Note:** For the full manifest schema reference (all runner variants, field descriptions, and design rationale), see [LeRobot Integration Design](../integrations/lerobot.md#2-converged-manifest-format). The format is shared by both PhysicalAI and LeRobot exports. + +**PhysicalAI-native format (`class_path` + `init_args`):** + +PhysicalAI can also write manifests using the full `class_path` + `init_args` format for components. This gives full power over component instantiation (custom classes, nested configs) while remaining loadable by PhysicalAI's `ComponentRegistry`: + +```json +{ + "format": "policy_package", + "version": "1.0", + "policy": { + "name": "act", + "source": { + "repo_id": "lerobot/act_aloha_sim_transfer_cube_human", + "class_path": "physicalai.policies.act.policy.ACT" } }, - "preprocessors": [ - { - "class_path": "mypackage.preprocessors.ImageResize", - "init_args": { - "target_size": [640, 640] - } - } - ], - "postprocessors": [ - { - "class_path": "mypackage.postprocessors.NMS", + "inference": { + "n_obs_steps": 1, + "runner": { + "class_path": "action_chunking", "init_args": { - "confidence_threshold": 0.5 + "chunk_size": 100, + "n_action_steps": 100 } + }, + "artifacts": { + "model": "model.onnx" + }, + "io": { + "inputs": [ + {"name": "observation.image", "dtype": "float32", "shape": ["B", 3, 96, 96]}, + {"name": "observation.state", "dtype": "float32", "shape": ["B", 14]} + ], + "outputs": [ + {"name": "action", "dtype": "float32", "shape": ["B", 100, 14]} + ], + "preprocessors": [ + { + "class_path": "normalize", + "init_args": { + "mode": "mean_std", + "stats_path": "stats.safetensors", + "features": ["observation.state"] + } + } + ], + "postprocessors": [ + { + "class_path": "denormalize", + "init_args": { + "mode": 
"mean_std", + "stats_path": "stats.safetensors", + "features": ["action"] + } + } + ] } - ] + }, + "hardware": { + "robots": [], + "cameras": [] + }, + "metadata": { + "created_at": "2026-03-27T12:00:00Z", + "created_by": "physicalai.export" + } } ``` +> **Both formats resolve identically.** The `type`-based example above (used by LeRobot) and this `class_path`-based example both resolve to the same runner and processor instances through the `ComponentRegistry`. See [Dual Component Resolution](../integrations/lerobot.md#dual-component-resolution) for the full resolution algorithm. + **How models are loaded:** -The framework reads `manifest.json` and resolves the model configuration: +The framework reads `manifest.json` and resolves the model configuration using **dual-path component resolution**: -1. **Built‑in models** (physicalai-train, LeRobot): `policy.kind` maps to a built‑in runner. No `class_path` needed for the runner — the `kind` field is sufficient. -2. **Custom/exotic models**: `runner.class_path` points to the user's runner class. The framework instantiates it dynamically. -3. **Hardware validation**: `robots` and `cameras` sections declare expected shapes. The runtime validates observations against these on first contact. +1. **Manifest parsing**: `manifest.json` is parsed directly into nested Pydantic models --- no flattening or normalization step. +2. **Runner resolution**: Components support two formats that both resolve through the same `ComponentRegistry` + `instantiate_component()` pipeline: + - **`type` + flat params** (interoperable, written by LeRobot): `{"type": "action_chunking", "chunk_size": 100}` → registry lookup → `ComponentSpec` → `instantiate_component()` + - **`class_path` + `init_args`** (full-power, written by PhysicalAI): `{"class_path": "action_chunking", "init_args": {"chunk_size": 100}}` → `ComponentSpec` → `instantiate_component()` +3. 
**Backend selection**: `inference.artifacts` maps named roles (e.g., `"model"`, `"encoder"`) to filenames. The first available backend is auto-selected, or the user can override at load time. +4. **I/O pipeline**: `inference.io.preprocessors` and `inference.io.postprocessors` declare input/output transforms (normalization, denormalization) resolved via the same dual-path mechanism. +5. **Hardware validation**: `hardware.robots` and `hardware.cameras` sections declare expected shapes. The runtime can validate observations against these. +6. **Custom components**: Domain layers can extend the manifest with custom processor types or runner parameters without modifying inferencekit. A component given as `class_path` + `init_args` is converted straight to a `ComponentSpec`, while a component given as `type` + flat params is first looked up in the `ComponentRegistry`; both are then built by `instantiate_component()`. -The `class_path` + `init_args` pattern allows domain layers to specify their own components in the manifest without inferencekit needing to know about them. +> **See also**: [LeRobot Integration Design — Runner Resolution](../integrations/lerobot.md#runner-resolution) for the full resolution algorithm and examples. 
--- @@ -596,28 +688,41 @@ Domain layers can publish model packages to HuggingFace that include: { "format": "policy_package", "version": "1.0", - "robots": [...], - "cameras":[...], "policy": { "name": "my_model", - "kind": "custom" - }, - "domain_package": "my-domain-inference", - "artifacts": { - "onnx": "model.onnx" - }, - "runner": { - "class_path": "my_domain_inference.runners.MyDomainRunner", - "init_args": { - "param1": "value1" + "source": { + "class_path": "my_domain_inference.models.MyModel" } }, - "preprocessors": [ - { - "class_path": "my_domain_inference.preprocessors.MyPreprocessor", - "init_args": {} + "inference": { + "n_obs_steps": 1, + "runner": { + "type": "action_chunking", + "chunk_size": 1, + "n_action_steps": 1 + }, + "artifacts": { + "model": "model.onnx" + }, + "io": { + "inputs": [ + {"name": "input", "dtype": "float32", "shape": ["B", 3, 640, 640]} + ], + "outputs": [ + {"name": "output", "dtype": "float32", "shape": ["B", 100, 6]} + ], + "preprocessors": [], + "postprocessors": [] } - ] + }, + "hardware": { + "robots": [], + "cameras": [] + }, + "metadata": { + "created_at": "2026-03-27T12:00:00Z", + "created_by": "my-domain-inference" + } } ``` @@ -1118,5 +1223,5 @@ Instead of replacing model_api, inferencekit provides the **foundation** that mo --- -_Document Version: 3.0_ -_Last Updated: 2026-02-16_ +_Document Version: 5.0_ +_Last Updated: 2026-03-27_ diff --git a/docs/design/integrations/lerobot.md b/docs/design/integrations/lerobot.md index 84f8a92..c0532f0 100644 --- a/docs/design/integrations/lerobot.md +++ b/docs/design/integrations/lerobot.md @@ -1,595 +1,975 @@ -# physicalai: LeRobot Integration Design +# PhysicalAI: LeRobot Integration Design **Status**: Proposal -**Author**: [Your Name] -**Date**: 2026-01-13 -**Relates to**: [LeRobot Policy Export Design](./policy_export_design.md) - -> **Important: LeRobot export is our proposal, not an agreed standard.** -> The PolicyPackage format (`manifest.json`) described in this 
document is a design we have proposed to the LeRobot team. It has **not yet been reviewed or accepted** upstream. If the LeRobot team adopts a different export format or modifies the proposed schema, this integration design will need to adapt accordingly. The architectural approach (unified manifest format, no lerobot dependency at runtime) remains valid regardless of the final format — only the loader implementation would change. +**Author**: Samet Akcay +**Date**: 2026-03-27 +**Relates to**: [Inference Core Design](../components/inferencekit.md) --- ## Executive Summary -This document describes how **physicalai** integrates with LeRobot's proposed PolicyPackage format. The integration is seamless because both physicalai-train and LeRobot use the **same unified `manifest.json` format**. The runtime reads `manifest.json` (pure JSON, no lerobot import) and maps `policy.kind` to built‑in runners. No LeRobot dependency is needed at deployment time. +This document describes how **PhysicalAI** integrates with **LeRobot** exported models using a **single converged manifest format**. Both frameworks produce `manifest.json` files with the same schema, eliminating the need for format adapters or translation layers. + +**Key principles:** + +1. **One schema, two expressiveness levels** --- The manifest schema supports two component formats: `type` + flat params (interoperable, used by LeRobot) and `class_path` + `init_args` (full-power, used by PhysicalAI). PhysicalAI reads both; LeRobot reads `type` only. +2. **LeRobot is standalone** --- LeRobot's export system works perfectly without PhysicalAI installed. No PhysicalAI imports, no PhysicalAI class paths in manifests. +3. **PhysicalAI loads LeRobot exports natively** --- `InferenceModel.load("./lerobot_export")` works out of the box. No adapter class, no special-casing. +4. **Dependency is strictly one-way** --- LeRobot does not depend on PhysicalAI. PhysicalAI reads LeRobot's output (pure JSON) without importing LeRobot. 
+ +```text +LeRobot (standalone) PhysicalAI +-------------------- ---------- +policy.export("./out") --produces--> InferenceModel.load("./out") + | + Same manifest.json schema +-- reads manifest.json + Writes: type + flat params +-- resolves via type OR class_path + Own runners (numpy-only) +-- builds preprocessors/postprocessors from io + Zero physicalai deps +-- runs inference through pipeline +``` + +### Dual Component Resolution -**Key principle:** All packages (physicalai-train, LeRobot, custom) export models using the same `manifest.json` format. physicalai consumes them identically. No special-casing, no separate format loaders, no circular dependencies. +The manifest supports two ways to specify components (runners, preprocessors, postprocessors): -**Note on status**: The PolicyPackage export format is our proposal to the LeRobot team (see [LeRobot Export Suggestions](../internal/lerobot-export-suggestions.md)). The format details below reflect our proposed design. The integration approach is sound regardless of the final format the LeRobot team adopts. +| Format | Who writes | Who reads | Example | +| ------------------------------ | ---------------------------------- | -------------------- | --------------------------------------------------------------------- | +| **`type` + flat params** | LeRobot, simple PhysicalAI exports | Both (interoperable) | `{"type": "action_chunking", "chunk_size": 100}` | +| **`class_path` + `init_args`** | PhysicalAI (full-power) | PhysicalAI only | `{"class_path": "action_chunking", "init_args": {"chunk_size": 100}}` | + +PhysicalAI resolves both through the same `ComponentRegistry` + `instantiate_component()` pipeline. LeRobot only reads `type` and maps to its own implementations. See [Runner Resolution](#runner-resolution) for the resolution algorithm. 
+ +--- + +## Table of Contents + +- [PhysicalAI: LeRobot Integration Design](#physicalai-lerobot-integration-design) + - [Executive Summary](#executive-summary) + - [Dual Component Resolution](#dual-component-resolution) + - [Table of Contents](#table-of-contents) + - [1. Architecture Overview](#1-architecture-overview) + - [2. Converged Manifest Format](#2-converged-manifest-format) + - [Schema Overview](#schema-overview) + - [Full Example: ACT Policy](#full-example-act-policy) + - [Runner Variants](#runner-variants) + - [Field Reference](#field-reference) + - [Top-Level Envelope](#top-level-envelope) + - [`policy` --- Identity](#policy-----identity) + - [`inference` --- How to Run](#inference-----how-to-run) + - [`hardware` --- Deployment](#hardware-----deployment) + - [`metadata` --- Provenance](#metadata-----provenance) + - [Preprocessor / Postprocessor Entry](#preprocessor--postprocessor-entry) + - [Design Decisions](#design-decisions) + - [3. How PhysicalAI Loads the Manifest](#3-how-physicalai-loads-the-manifest) + - [Loading Flow](#loading-flow) + - [Runner Resolution](#runner-resolution) + - [Preprocessor and Postprocessor Construction](#preprocessor-and-postprocessor-construction) + - [4. How LeRobot Uses the Manifest](#4-how-lerobot-uses-the-manifest) + - [5. Runner Mapping](#5-runner-mapping) + - [`inference.runner.type` to Runner](#inferencerunnertype-to-runner) + - [Runner Parameters (All in `inference.runner`)](#runner-parameters-all-in-inferencerunner) + - [6. Normalization Handling](#6-normalization-handling) + - [Problem](#problem) + - [Solution: Preprocessor and Postprocessor Entries](#solution-preprocessor-and-postprocessor-entries) + - [PhysicalAI Implementation](#physicalai-implementation) + - [Normalization Modes](#normalization-modes) + - [Stats File Format](#stats-file-format) + - [7. 
Usage Examples](#7-usage-examples) + - [Basic Usage](#basic-usage) + - [With Callbacks](#with-callbacks) + - [Override Runner Parameters](#override-runner-parameters) + - [Real-Time Control](#real-time-control) + - [8. Supported Policies](#8-supported-policies) + - [9. Testing Strategy](#9-testing-strategy) + - [Conformance Tests](#conformance-tests) + - [Parity Tests](#parity-tests) + - [Backward Compatibility Tests](#backward-compatibility-tests) + - [10. Migration from Legacy Formats](#10-migration-from-legacy-formats) + - [Migration Path](#migration-path) + - [Schema Enforcement](#schema-enforcement) + - [Appendix A: Design Rationale](#appendix-a-design-rationale) + - [Why One Format Instead of Two?](#why-one-format-instead-of-two) + - [Why `inference` as a Container?](#why-inference-as-a-container) + - [Why Not `inference: null` for Single-Pass?](#why-not-inference-null-for-single-pass) + - [Why Preprocessors Inside `io`?](#why-preprocessors-inside-io) + - [Why `policy.source.class_path`?](#why-policysourceclass_path) + - [Appendix B: Comparison with Previous Design](#appendix-b-comparison-with-previous-design) + - [Related Documents](#related-documents) --- ## 1. Architecture Overview +The integration is seamless because both frameworks share the same manifest schema. PhysicalAI's `InferenceModel` reads the manifest, resolves components (runner, preprocessors, postprocessors, adapter), and runs inference --- regardless of which framework produced the export. 
+ ```text -┌────────────────────────────────────────────────────────────────┐ -│ physicalai │ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ -│ │ Adapters │ │ Built‑in │ │ Callbacks │ │ -│ │ (backends) │ │ Runners │ │ (instrumentation) │ │ -│ └──────────────┘ └──────────────┘ └──────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────┐ │ -│ │ Unified Manifest Loader │ │ -│ │ │ │ -│ │ manifest.json (same format for all model sources) │ │ -│ │ physicalai-train, LeRobot, custom — all use the same schema │ -│ └──────────────────────────────────────────────────────────┘ │ -└────────────────────────────────────────────────────────────────┘ - │ - │ reads (pure file I/O) - ▼ - ┌──────────────────────────┐ - │ Exported Model │ - │ (any source) │ - │ │ - │ manifest.json │ - │ model artifacts │ - └──────────────────────────┘ ++-----------------------------------------------------------------------+ +| PhysicalAI | +| | +| +----------------+ +-----------------+ +------------------------+ | +| | Adapters | | Built-in | | Callbacks | | +| | (backends) | | Runners | | (instrumentation) | | +| | | | | | | | +| | ONNX, OpenVINO | | SinglePass | | TimingCallback | | +| | TensorRT | | ActionChunking | | LoggingCallback | | +| | TorchExportIR | | Iterative | | ActionSafetyCallback | | +| | | | TwoPhase | | | | +| +----------------+ +-----------------+ +------------------------+ | +| | +| +---------------------------------------------------------------+ | +| | Manifest Loader | | +| | | | +| | manifest.json --> parse --> resolve components --> run | | +| | (same schema for all sources: PhysicalAI, LeRobot, custom) | | +| +---------------------------------------------------------------+ | ++-----------------------------------------------------------------------+ + | + | reads (pure JSON file I/O) + v + +----------------------------+ + | Exported Package | + | (any source) | + | | + | manifest.json | + | model.onnx | + | 
stats.safetensors | + +----------------------------+ ``` +**What PhysicalAI adds over LeRobot's standalone runtime:** + +| Feature | LeRobot Standalone | PhysicalAI | +| ----------------------------------- | ------------------ | ---------------------- | +| Load exported policy | Yes | Yes | +| Single-pass inference | Yes | Yes | +| Iterative inference | Yes | Yes | +| Two-phase inference | Yes | Yes | +| Action chunking | Yes | Yes | +| Callbacks (timing, logging, safety) | No | Yes | +| Multi-backend with fallback | ONNX + OpenVINO | ONNX + OpenVINO + TRT | +| Preprocessor/postprocessor chains | Fixed pipeline | Extensible chain | +| HuggingFace Hub loading | No | Yes (`hf://user/repo`) | +| `select_action()` / `reset()` API | No | Yes | + --- -## 2. Unified Manifest Format +## 2. Converged Manifest Format -All packages (physicalai-train, LeRobot, custom) use the same `manifest.json` schema. This section describes the fields relevant to LeRobot policies specifically. +### Schema Overview -### Package Detection +The manifest mirrors PhysicalAI's `InferenceModel` class hierarchy, following the same philosophy as training configs (which split into `model`, `data`, `trainer` sections): -A directory is an exported model package if it contains `manifest.json` with `"format": "policy_package"`: +```text +manifest.json ++-- format + version (envelope --- what is this file?) ++-- policy (identity --- what policy is this?) +| +-- name (human-readable name) +| +-- source (provenance: repo_id, class_path) ++-- inference (InferenceModel --- how to run it?) +| +-- n_obs_steps (observation window size) +| +-- runner (execution pattern + parameters) +| +-- artifacts (model files by named role) +| +-- io (I/O contract: shapes, preprocessors, postprocessors) ++-- hardware (deployment --- what hardware?) +| +-- robots (robot configurations) +| +-- cameras (camera configurations) ++-- metadata (provenance --- when/who created this?) 
+``` -```python -def is_policy_package(path: Path) -> bool: - manifest_path = path / "manifest.json" - if not manifest_path.exists(): - return False - manifest = json.loads(manifest_path.read_text()) - return manifest.get("format") == "policy_package" +### Full Example: ACT Policy + +```json +{ + "format": "policy_package", + "version": "1.0", + "policy": { + "name": "act", + "source": { + "repo_id": "lerobot/act_aloha_sim_transfer_cube_human", + "class_path": "lerobot.policies.act.modeling_act.ACTPolicy" + } + }, + "inference": { + "n_obs_steps": 1, + "runner": { + "type": "action_chunking", + "chunk_size": 100, + "n_action_steps": 100 + }, + "artifacts": { + "model": "model.onnx" + }, + "io": { + "inputs": [ + {"name": "observation.image", "dtype": "float32", "shape": ["B", 3, 96, 96]}, + {"name": "observation.state", "dtype": "float32", "shape": ["B", 14]} + ], + "outputs": [ + {"name": "action", "dtype": "float32", "shape": ["B", 100, 14]} + ], + "preprocessors": [ + { + "type": "normalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["observation.state"] + } + ], + "postprocessors": [ + { + "type": "denormalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["action"] + } + ] + } + }, + "hardware": { + "robots": [], + "cameras": [] + }, + "metadata": { + "created_at": "2026-03-27T12:00:00Z", + "created_by": "lerobot.export" + } +} ``` -### Manifest Fields Used +> **Note on image inputs:** Image normalization (uint8 to float32, divide by 255) is baked into the ONNX graph during export. Only non-image features that use dataset-level statistics (e.g., `observation.state`) need explicit preprocessor entries. 
-| Field | physicalai Usage | -| --------------- | -------------------------------------------------------------------------------------- | -| `format` | Package type detection | -| `version` | Schema compatibility check | -| `policy.kind` | Runner selection (`single_shot` → `SinglePassRunner`, `iterative` → `IterativeRunner`) | -| `artifacts` | Backend artifact paths | -| `io` | Input/output validation | -| `action` | Action semantics (chunk_size, n_action_steps) | -| `iterative` | Loop parameters (num_steps, scheduler) | -| `normalization` | Normalizer configuration | -| `x-physical-ai` | Extension fields (callbacks, adapter options) | +### Runner Variants + +The `inference.runner` section is open-ended --- policy-specific parameters go directly in the runner object alongside `type`. This avoids the need for a rigid union schema. Each runner implementation declares its expected parameters and logs a warning for any unrecognized keys (see [Runner Parameter Validation](#runner-parameter-validation)). + +**ACT / VQBeT** (single-pass with action chunking): + +```json +"runner": { + "type": "action_chunking", + "chunk_size": 100, + "n_action_steps": 100 +} +``` + +**Diffusion Policy** (iterative denoising): + +```json +"runner": { + "type": "iterative", + "horizon": 16, + "n_action_steps": 8, + "num_inference_steps": 100, + "scheduler": "ddpm" +} +``` + +**PI0** (two-phase: encode once + denoise iteratively): + +For two-phase policies, all model artifacts are listed in `inference.artifacts` with named roles (`encoder`, `denoise`) rather than backend names. 
The runner references these roles: + +```json +"artifacts": { + "encoder": "encoder.onnx", + "denoise": "denoise.onnx" +}, +"runner": { + "type": "two_phase", + "chunk_size": 50, + "n_action_steps": 50, + "num_inference_steps": 10, + "scheduler": "euler" +} +``` + +**SmolVLA** (two-phase, no explicit scheduler): + +```json +"artifacts": { + "encoder": "encoder.onnx", + "denoise": "denoise.onnx" +}, +"runner": { + "type": "two_phase", + "chunk_size": 50, + "n_action_steps": 50, + "num_inference_steps": 10 +} +``` + +**TDMPC** (iterative with model-predictive control): + +```json +"runner": { + "type": "iterative", + "horizon": 5, + "n_action_steps": 1, + "use_mpc": true, + "cem_iterations": 6 +} +``` + +### Field Reference + +#### Top-Level Envelope + +| Field | Type | Required | Description | +| --------- | ------ | -------- | ------------------------------------------------ | +| `format` | string | Yes | Always `"policy_package"`. Schema identification | +| `version` | string | Yes | Schema version (semver). 
Currently `"1.0"` | + +#### `policy` --- Identity + +| Field | Type | Required | Description | +| -------------------------- | ------ | -------- | --------------------------------------------------- | +| `policy.name` | string | Yes | Human-readable policy name (e.g., `"act"`, `"pi0"`) | +| `policy.source` | object | No | Provenance information | +| `policy.source.repo_id` | string | No | HuggingFace repo ID | +| `policy.source.class_path` | string | No | Original Python class path | + +#### `inference` --- How to Run + +| Field | Type | Required | Description | +| ----------------------------- | ------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `inference.n_obs_steps` | int | Yes | Number of observation timesteps needed by the model (see the `n_obs_steps` row in [Design Decisions](#design-decisions)) | +| `inference.runner` | object | Yes | Runner configuration (see variants) | +| `inference.runner.type` | string | Yes | Runner type: `action_chunking`, `iterative`, `two_phase` | +| `inference.artifacts` | object | Yes | Map of artifact role to filename. Single-model: `{"model": "model.onnx"}`. Two-phase: `{"encoder": "encoder.onnx", "denoise": "denoise.onnx"}` | +| `inference.io` | object | Yes | I/O specification | +| `inference.io.inputs` | array | Yes | Input tensor specifications | +| `inference.io.outputs` | array | Yes | Output tensor specifications | +| `inference.io.preprocessors` | array | No | Input transforms (normalize, etc.) | +| `inference.io.postprocessors` | array | No | Output transforms (denormalize, etc.) 
| + +#### `hardware` --- Deployment + +| Field | Type | Required | Description | +| ------------------ | ----- | -------- | -------------------------------- | +| `hardware.robots` | array | No | Robot configurations (optional) | +| `hardware.cameras` | array | No | Camera configurations (optional) | + +#### `metadata` --- Provenance + +| Field | Type | Required | Description | +| --------------------- | ------ | -------- | ------------------ | +| `metadata.created_at` | string | No | ISO 8601 timestamp | +| `metadata.created_by` | string | No | Creator identifier | + +#### Preprocessor / Postprocessor Entry + +| Field | Type | Required | Description | +| ------------ | ------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `type` | string | Yes | Processor type: `"normalize"`, `"denormalize"`, or custom | +| `class_path` | string | No | Python class path for custom processor types. Built-in types (`normalize`, `denormalize`) resolve by convention; unknown types require `class_path`. | +| `mode` | string | No | Normalization mode: `"mean_std"`, `"min_max"`, `"identity"` (required for `normalize`/`denormalize`) | +| `artifact` | string | No | Path to stats file (e.g., `"stats.safetensors"`) (required for `normalize`/`denormalize`) | +| `features` | array | No | Feature names to process (e.g., `["observation.state"]`) (required for `normalize`/`denormalize`) | + +Built-in types resolve by convention: `"normalize"` maps to `StatsNormalizer`, `"denormalize"` maps to `StatsDenormalizer`. 
For custom processor types, provide a `class_path`: + +```json +{ + "type": "clamp", + "class_path": "physicalai.inference.postprocessors.ActionClamp", + "min": -1.0, + "max": 1.0 +} +``` + +### Design Decisions + +| Decision | Rationale | +| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| **No `policy.type` field** | `inference.runner.type` is the sole source of truth for runner construction. A separate `policy.type` would overlap without matching (e.g., ACT is `single_pass` but uses `action_chunking` runner), causing confusion. Eliminated to avoid ambiguity. | +| **`hardware` is top-level** | Deployment needs hardware information even if LeRobot doesn't use it yet. Cheap future-proofing. | +| **Preprocessors inside `inference.io`** | They are I/O transforms, logically part of the I/O boundary. Not separate top-level sections. | +| **`format` + `version` kept** | Cheap future-proofing for schema evolution. `format` enables detection, `version` enables migration. | +| **`inference` is a container** | Mirrors `InferenceModel` class hierarchy. Matches training config philosophy (`model`/`data`/`trainer`). | +| **No separate `action` section** | `chunk_size` and `n_action_steps` are runner behavioral params. `action_dim` is redundant with output shape. | +| **`mode` per processor** | Different features may use different normalization modes (mean_std vs min_max). | +| **Runner params are open with validation** | Policy-specific fields go directly in runner. Each runner declares expected params and warns on unknown keys (see [Runner Parameter Validation](#runner-parameter-validation)). 
| +| **Named artifact roles** | `inference.artifacts` uses role names (`model`, `encoder`, `denoise`) not backend names. This eliminates the split between `inference.artifacts` and runner-level artifact refs for two-phase policies, giving a single authoritative location for all model files. | +| **Extensible processor types** | Built-in types (`normalize`, `denormalize`) resolve by convention. Unknown types fall back to `class_path`, allowing custom processors without code changes to the loader. | +| **`n_obs_steps` in `inference`** | Although `n_obs_steps` describes what the model expects (a contract), it is consumed during inference preparation --- the caller uses it to decide how many observation frames to collect before calling the model. It sits alongside other inference-time parameters rather than in `policy` (which is purely identity/provenance). | +| **Shared JSON Schema for CI** | Both projects validate exported manifests against a shared `manifest.schema.json` to prevent schema drift (see [Schema Enforcement](#schema-enforcement)). | +| **Dual component resolution** | Components support both `type` + flat params (interoperable, LeRobot-compatible) and `class_path` + `init_args` (PhysicalAI full-power). Both resolve through the same `ComponentRegistry` + `instantiate_component()` pipeline. This avoids per-type if-chains while preserving the existing component system. See [Dual Component Resolution](#dual-component-resolution). | +| **No `_normalize_metadata()`** | The nested manifest structure is parsed directly into Pydantic models. No flattening shim needed --- both the schema and the loader are designed together. Legacy `metadata.yaml` files (pre-manifest era) are handled by `from_legacy_metadata()` only. | --- -## 3. Manifest Loader Implementation +## 3. How PhysicalAI Loads the Manifest -### How It Works +### Loading Flow -The manifest loader is unified — there is no separate "LeRobot loader" vs "physicalai-train loader". 
The same code parses `manifest.json` for all model sources. The `policy.kind` field determines which built‑in runner to use. +The manifest is parsed directly into nested Pydantic models --- no intermediate flattening or normalization step. The nested JSON maps 1:1 to the Pydantic model hierarchy: ```python -# physicalai/manifest_loader.py - -class ManifestLoader: - """Unified manifest loader for all model sources.""" - - @staticmethod - def detect(path: Path) -> bool: - """Check if path contains a valid manifest.""" - manifest_path = path / "manifest.json" - if not manifest_path.exists(): - return False - try: - manifest = json.loads(manifest_path.read_text()) - return manifest.get("format") == "policy_package" - except (json.JSONDecodeError, KeyError): - return False - - @staticmethod - def load( - path: Path, - backend: str | None = None, - device: str = "cpu", - **kwargs - ) -> "InferenceModel": - """Load a model package into an InferenceModel.""" - manifest = json.loads((path / "manifest.json").read_text()) - - # Validate schema version - version = manifest.get("version", "1.0") - if not version.startswith("1."): - raise ValueError(f"Unsupported manifest version: {version}") - - # Select backend - backend = backend or _select_default_backend(manifest) - artifact_path = path / manifest["artifacts"][backend] - - # Create adapter (via inference core) - adapter = get_adapter(backend)(artifact_path, device=device) - - # Select runner based on policy kind - kind = manifest["policy"]["kind"] - runner = _create_runner(kind, manifest, **kwargs) - - # Create normalizer (if specified) - normalizer = _create_normalizer(path, manifest) - - # Load callbacks from extensions - callbacks = _load_callbacks(manifest) - - return InferenceModel( - adapter=adapter, - runner=runner, - normalizer=normalizer, - callbacks=callbacks, - metadata=manifest, - ) +# In InferenceModel.load(): +raw = json.loads((path / "manifest.json").read_text()) + +# Validate format +if raw.get("format") != 
"policy_package": + msg = f"Unknown manifest format: {raw.get('format')}" + raise ValueError(msg) + +# Parse directly into nested Pydantic models +manifest = Manifest.model_validate(raw) + +# Resolve components from typed manifest fields +runner = resolve_runner(manifest.inference.runner) +adapter = create_adapter(manifest.inference.artifacts, path) +preprocessors = resolve_processors(manifest.inference.io.preprocessors, path) +postprocessors = resolve_processors(manifest.inference.io.postprocessors, path) +``` + +> **Legacy `metadata.yaml` files** (pre-manifest era, before `manifest.json` existed) are handled separately by `from_legacy_metadata()` in `manifest.py`. This is unrelated to the manifest format --- it handles the old YAML-based metadata from early PhysicalAI exports. +### Runner Resolution -def _create_runner(kind: str, manifest: dict, **kwargs) -> InferenceRunner: - """Map policy.kind to a built‑in runner.""" - if kind == "single_pass": - return SinglePassRunner() +The runner factory uses **dual-path resolution** --- a single if-check, not an if-chain per type. If `class_path` is present, it goes straight to `ComponentSpec` instantiation. Otherwise, `type` is resolved through the same `ComponentRegistry` pipeline: - elif kind == "iterative": - iter_config = manifest.get("inference", {}) - return IterativeRunner( - num_steps=kwargs.get("num_steps", iter_config.get("num_steps", 10)), - scheduler=kwargs.get("scheduler", iter_config.get("scheduler", "euler")), - timestep_spacing=iter_config.get("timestep_spacing", "linear"), +```python +def resolve_runner(runner_config: dict) -> InferenceRunner: + """Resolve runner from manifest config using dual-path resolution. + + Path 1: class_path + init_args → ComponentSpec → instantiate_component() + Path 2: type + flat params → registry lookup → ComponentSpec → instantiate_component() + + Both paths end at the same instantiate_component() call. 
+ """ + if "class_path" in runner_config: + # PhysicalAI-native: full ComponentSpec path + spec = ComponentSpec.model_validate(runner_config) + return instantiate_component(spec) + + # Framework-agnostic: type → registry resolves short name to class_path + runner_type = runner_config["type"] + init_args = {k: v for k, v in runner_config.items() if k != "type"} + spec = ComponentSpec(class_path=runner_type, init_args=init_args) + return instantiate_component(spec) +``` + +**How `instantiate_component()` handles `class_path` values:** + +- Full path (e.g., `"physicalai.inference.runners.ActionChunkingRunner"`) → direct import +- Short name (e.g., `"action_chunking"`) → `ComponentRegistry.resolve()` maps to full path → import + +This means `type: "action_chunking"` and `class_path: "action_chunking"` both resolve identically through the registry. The registry is the single source of truth for mapping names to classes. + +**Example: How the same runner loads from both formats:** + +```json +// LeRobot writes (type + flat params): +{"type": "action_chunking", "chunk_size": 100, "n_action_steps": 100} + +// PhysicalAI writes (class_path + init_args): +{"class_path": "action_chunking", "init_args": {"chunk_size": 100, "n_action_steps": 100}} + +// Both resolve to: +// ComponentSpec(class_path="action_chunking", init_args={"chunk_size": 100, "n_action_steps": 100}) +// → ComponentRegistry.resolve("action_chunking") → ActionChunkingRunner +// → instantiate_component(spec) → ActionChunkingRunner(chunk_size=100, n_action_steps=100) +``` + +### Runner Parameter Validation + +Each runner declares the parameters it consumes. 
Unknown keys trigger a warning, catching typos without breaking forward compatibility: + +```python +class IterativeRunner(InferenceRunner): + EXPECTED_PARAMS = {"type", "num_inference_steps", "scheduler", "horizon", "n_action_steps"} + + @classmethod + def from_config(cls, config: dict) -> "IterativeRunner": + unknown = set(config.keys()) - cls.EXPECTED_PARAMS + if unknown: + logger.warning("IterativeRunner: ignoring unknown params: %s", unknown) + return cls( + num_steps=config.get("num_inference_steps", 10), + scheduler=config.get("scheduler", "euler"), ) +``` + +### Preprocessor and Postprocessor Construction + +Processors use the same dual-path resolution as runners. The `resolve_processors()` function handles both `class_path` + `init_args` (PhysicalAI-native) and `type` + flat params (interoperable): + +```python +def resolve_processors(specs: list[dict], path: Path) -> list: + """Build processor chain from manifest specs using dual-path resolution. + + Each spec is resolved identically to runners: + - class_path present → ComponentSpec → instantiate_component() + - type present → registry lookup → ComponentSpec → instantiate_component() + """ + processors = [] + for spec in specs: + if "class_path" in spec: + # PhysicalAI-native: full ComponentSpec path + component_spec = ComponentSpec.model_validate(spec) + processors.append(instantiate_component(component_spec)) + else: + # Framework-agnostic: type → registry → ComponentSpec + processor_type = spec["type"] + init_args = {k: v for k, v in spec.items() if k != "type"} + # Resolve relative artifact paths to absolute + if "artifact" in init_args: + init_args["stats_path"] = path / init_args.pop("artifact") + component_spec = ComponentSpec(class_path=processor_type, init_args=init_args) + processors.append(instantiate_component(component_spec)) + return processors +``` + +**Example: Normalize processor from both formats:** - elif kind == "two_phase": - iter_config = manifest.get("inference", {}) - return
TwoPhaseRunner( - num_steps=kwargs.get("num_steps", iter_config.get("num_steps", 10)), - scheduler=kwargs.get("scheduler", iter_config.get("scheduler", "euler")), +```json +// LeRobot writes (type + flat params): +{"type": "normalize", "mode": "mean_std", "artifact": "stats.safetensors", "features": ["observation.state"]} + +// PhysicalAI writes (class_path + init_args): +{"class_path": "normalize", "init_args": {"mode": "mean_std", "stats_path": "stats.safetensors", "features": ["observation.state"]}} + +// Both resolve to: StatsNormalizer(mode="mean_std", stats_path=..., features=["observation.state"]) +``` + +> **Note:** The `artifact` → `stats_path` key rename happens during resolution for `type`-format specs. In `class_path` format, the key is already `stats_path` (matching the constructor parameter name). + +--- + +## 4. How LeRobot Uses the Manifest + +LeRobot reads the same `manifest.json` with its own tooling. It does NOT use pydantic --- it uses `draccus` dataclasses or plain `json.load()`. + +```python +# LeRobot's own loading (no physicalai dependency) +import json +from pathlib import Path + +def load_exported_policy(path: str | Path) -> ExportedPolicy: + """Load an exported policy package.""" + path = Path(path) + raw = json.loads((path / "manifest.json").read_text()) + + # Read runner config + runner_config = raw["inference"]["runner"] + runner_type = runner_config["type"] + + # Build LeRobot's own runner (standalone, numpy-only) + if runner_type == "action_chunking": + runner = ActionChunkingWrapper( + SinglePassRunner(), + chunk_size=runner_config["chunk_size"], + n_action_steps=runner_config["n_action_steps"], + ) + elif runner_type == "iterative": + runner = IterativeRunner( + num_steps=runner_config["num_inference_steps"], + scheduler=runner_config.get("scheduler", "euler"), ) + elif runner_type == "two_phase": + runner = TwoPhaseRunner(...) + ... 
+ + # Load normalizer from io specs + preprocessors = raw["inference"]["io"].get("preprocessors", []) + postprocessors = raw["inference"]["io"].get("postprocessors", []) + normalizer = Normalizer.from_specs(preprocessors + postprocessors, path) - elif kind == "custom": - # Custom runner specified via class_path - runner_config = manifest.get("runner", {}) - return instantiate(runner_config) + # Load backend adapter (from named artifact role) + artifacts = raw["inference"]["artifacts"] + adapter = ONNXRuntimeAdapter(path / artifacts["model"]) - else: - raise ValueError(f"Unknown policy kind: {kind}") + return ExportedPolicy(runner=runner, adapter=adapter, normalizer=normalizer) ``` -### Installation +**Key point:** LeRobot's runners, normalizer, and adapters are its own implementations. They have zero overlap with PhysicalAI's implementations. The only shared artifact is the `manifest.json` file on disk. + +--- + +## 5. Runner Mapping + +### `inference.runner.type` to Runner + +| `runner.type` | PhysicalAI Runner | LeRobot Runner | Policies | +| ----------------- | ------------------------------------ | ----------------------------------------- | ---------------- | +| `action_chunking` | `ActionChunkingRunner(SinglePass())` | `ActionChunkingWrapper(SinglePassRunner)` | ACT, VQBeT | +| `iterative` | `IterativeRunner(SinglePass())` | `IterativeRunner` | Diffusion, TDMPC | +| `two_phase` | `TwoPhaseRunner(encoder, Iterative)` | `TwoPhaseRunner` | PI0, SmolVLA | + +### Runner Parameters (All in `inference.runner`) + +| Parameter | Used By | Description | +| --------------------- | ------------------------------------- | --------------------------------------- | +| `chunk_size` | action_chunking | Size of predicted action chunk | +| `n_action_steps` | action_chunking, two_phase, iterative | Actions to execute per chunk | +| `num_inference_steps` | iterative, two_phase | Number of denoising steps | +| `scheduler` | iterative, two_phase | Scheduler algorithm (euler, ddpm, 
ddim) | +| `horizon` | iterative | Planning horizon (Diffusion, TDMPC) | +| `use_mpc` | iterative | Enable model-predictive control (TDMPC) | +| `cem_iterations` | iterative | CEM optimization iterations (TDMPC) | + +> **Note:** Two-phase artifact paths (`encoder`, `denoise`) live in `inference.artifacts`, not in the runner config. See [Runner Variants](#runner-variants) for examples. + +--- + +## 6. Normalization Handling + +### Problem -The manifest loader is **built‑in** — it ships with physicalai. No extra install needed. +LeRobot policies operate on **normalized** inputs and produce **normalized** outputs. Normalization statistics are saved alongside the model in `stats.safetensors`. At inference time: -```bash -# This is all you need to run any exported model (physicalai-train, LeRobot, custom) -pip install physicalai +1. **Observations must be normalized** before feeding to the model +2. **Actions must be denormalized** after the model produces them + +### Solution: Preprocessor and Postprocessor Entries + +The manifest declares normalization as I/O transforms in `inference.io.preprocessors` and `inference.io.postprocessors`: + +```json +"io": { + "inputs": [...], + "outputs": [...], + "preprocessors": [ + { + "type": "normalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["observation.state"] + } + ], + "postprocessors": [ + { + "type": "denormalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["action"] + } + ] +} +``` + +### PhysicalAI Implementation + +Two pipeline components handle normalization: + +**`StatsNormalizer`** (preprocessor): + +```python +class StatsNormalizer(Preprocessor): + """Normalize input features using saved statistics.""" + + def __init__(self, stats_path: Path, features: list[str], mode: str = "mean_std"): + self.stats = load_stats(stats_path) + self.features = features + self.mode = mode + + def __call__(self, inputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]: + for 
feature in self.features: + if feature in inputs: + inputs[feature] = self._normalize(inputs[feature], feature) + return inputs +``` + +**`StatsDenormalizer`** (postprocessor): + +```python +class StatsDenormalizer(Postprocessor): + """Denormalize output features using saved statistics.""" + + def __init__(self, stats_path: Path, features: list[str], mode: str = "mean_std"): + self.stats = load_stats(stats_path) + self.features = features + self.mode = mode + + def __call__(self, outputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]: + for feature in self.features: + if feature in outputs: + outputs[feature] = self._denormalize(outputs[feature], feature) + return outputs ``` -The loader reads `manifest.json` (pure JSON parsing) and maps `policy.kind` to built‑in runners. No `lerobot` import. No `physicalai-train` import. No `physicalai[lerobot]` extra. +### Normalization Modes + +| Mode | Normalize | Denormalize | +| ---------- | --------------------------------- | --------------------------------- | +| `mean_std` | `(x - mean) / std` | `x * std + mean` | +| `min_max` | `(x - min) / (max - min) * 2 - 1` | `(x + 1) / 2 * (max - min) + min` | +| `identity` | passthrough | passthrough | + +### Stats File Format + +Normalization statistics are stored in `safetensors` format. Each feature has `{feature}/mean`, `{feature}/std`, `{feature}/min`, `{feature}/max` tensors as needed by the normalization mode. --- -## 4. Usage Examples +## 7. 
Usage Examples -### Basic Usage (Unified API) +### Basic Usage ```python from physicalai import InferenceModel -# Load LeRobot package (auto-detected via plugin) -model = InferenceModel("./pi0_exported") +# Load LeRobot-exported policy (detected automatically via manifest.json) +model = InferenceModel("./act_exported") -# Run inference (raw outputs) +# Run inference observation = { - "observation.images.top": image_array, - "observation.state": state_array, + "observation.image": image_array, # float32, shape (1, 3, 96, 96) + "observation.state": state_array, # float32, shape (1, 14) } outputs = model(observation) -action_chunk = outputs["action"] +action = outputs["action"] # float32, shape (1, 14) ``` ### With Callbacks ```python from physicalai import InferenceModel -from physicalai.callbacks import TimingCallback, LoggingCallback +from physicalai.inference.callbacks import TimingCallback model = InferenceModel( "./pi0_exported", - callbacks=[ - TimingCallback(), - LoggingCallback(log_inputs=False, log_outputs=True), - ], + callbacks=[TimingCallback()], ) -# Callbacks fire automatically on predict -action = model(observation) -# -> logs timing and output summary +outputs = model(observation) +# TimingCallback logs: "Inference: 12.3ms" ``` ### Override Runner Parameters ```python -# Override num_steps at load time (no re-export needed) +# Override denoising steps at load time (no re-export needed) model = InferenceModel( - "./pi0_exported", - num_steps=20, # Override manifest default of 10 - scheduler="ddim", + "./diffusion_exported", + num_steps=20, # Override manifest default of 100 + scheduler="ddim", # Override manifest default of "ddpm" ) ``` -### Real-Time Control (Policy API) +### Real-Time Control ```python from physicalai import InferenceModel -policy = InferenceModel("./pi0_exported") +policy = InferenceModel("./act_exported") policy.reset() while not done: action = policy.select_action(observation) observation, reward, done, info = env.step(action) 
-``` - -### Explicit Backend Selection - -```python -# Use specific backend -model = InferenceModel( - "./pi0_exported", - backend="onnx", - device="cuda:0", -) -# Or with adapter options -model = InferenceModel( - "./pi0_exported", - backend="onnx", - adapter_options={ - "providers": ["TensorrtExecutionProvider", "CUDAExecutionProvider"], - }, -) +# Reset between episodes +policy.reset() ``` --- -## 5. Extension Fields +## 8. Supported Policies -physicalai-specific configuration can be embedded in the manifest under `x-physical-ai`. These fields are ignored by LeRobot's own runtime: +All LeRobot policy types are supported through the converged runner system: -```json -{ - "format": "policy_package", - "version": "1.0", - - "policy": { ... }, - "artifacts": { ... }, - -"x-physical-ai": { - "callbacks": [ - "timing", - {"class_path": "myproject.callbacks.SafetyCallback", "init_args": {"max_velocity": 1.0}} - ], - "adapter": { - "providers": ["CUDAExecutionProvider", "CPUExecutionProvider"], - "graph_optimization_level": "all" - }, - "preprocessors": [ -{"class_path": "physicalai.preprocessors.ImageNormalize", "init_args": {"mean": [0.485, 0.456, 0.406]}} - ] - } -} -``` - -### Extension Schema - -| Field | Type | Description | | -| ---------------- | --------------------------- | ----------------------- | ------------------- | -| `callbacks` | `list[str \ | CallbackConfig]` | Callbacks to attach | -| `adapter` | `dict` | Adapter/backend options | | -| `preprocessors` | `list[PreprocessorConfig]` | Input preprocessors | | -| `postprocessors` | `list[PostprocessorConfig]` | Output postprocessors | | - -**Note**: LeRobot ignores `x-physical-ai` fields entirely. They are only read by physicalai. 
+| Policy | `runner.type` | Runner Stack | Artifact Roles | +| --------- | --------------- | -------------------------------------------- | -------------------- | +| ACT | action_chunking | ActionChunking(SinglePass) | `model` | +| VQBeT | action_chunking | ActionChunking(SinglePass) | `model` | +| Diffusion | iterative | ActionChunking(Iterative(SinglePass)) | `model` | +| TDMPC | iterative | Iterative(SinglePass) with MPC | `model` | +| PI0 | two_phase | ActionChunking(TwoPhase(encoder, Iterative)) | `encoder`, `denoise` | +| SmolVLA | two_phase | ActionChunking(TwoPhase(encoder, Iterative)) | `encoder`, `denoise` | --- -## 6. Runner Mapping - -### `policy.kind` → Built‑in Runner +## 9. Testing Strategy -| `policy.kind` | Runner | Notes | -| ------------- | ------------------ | -------------------------------- | -| `single_pass` | `SinglePassRunner` | Direct forward pass | -| `iterative` | `IterativeRunner` | Configurable loop with scheduler | -| `two_phase` | `TwoPhaseRunner` | Encode once + denoise loop | -| `custom` | via `class_path` | User-provided runner class | +### Conformance Tests -### IterativeRunner Configuration +Verify that PhysicalAI correctly loads manifests produced by LeRobot: ```python -class IterativeRunner(InferenceRunner): - """Runner for iterative/flow-matching policies.""" - - def __init__( - self, - num_steps: int = 10, - scheduler: str = "euler", - timestep_spacing: str = "linear", - timestep_range: tuple[float, float] = (1.0, 0.0), - ): - self.num_steps = num_steps - self.scheduler = scheduler - self.timestep_spacing = timestep_spacing - self.timestep_range = timestep_range - - def run(self, adapter: RuntimeAdapter, inputs: dict) -> dict: - # Initialize from noise - action_shape = self._infer_action_shape(inputs) - x_t = np.random.randn(*action_shape).astype(np.float32) - - # Generate timesteps - timesteps = self._generate_timesteps() - dt = -1.0 / self.num_steps - - # Iterative denoising - for t in timesteps: - step_inputs = { - 
**inputs, - "x_t": x_t, - "timestep": np.array([t], dtype=np.float32), - } - v_t = adapter.predict(step_inputs)["v_t"] - x_t = self._step(x_t, v_t, dt) - - return {"action": x_t} - - def _step(self, x: np.ndarray, v: np.ndarray, dt: float) -> np.ndarray: - if self.scheduler == "euler": - return x + dt * v - elif self.scheduler == "ddim": - # DDIM update rule - ... - else: - raise ValueError(f"Unknown scheduler: {self.scheduler}") -``` - ---- +class TestConvergedManifestLoading: + """Verify PhysicalAI loads converged manifest format.""" + + def test_detect_policy_package(self, package_path): + """Detect exported package via format field.""" + manifest = json.loads((package_path / "manifest.json").read_text()) + assert manifest["format"] == "policy_package" + + def test_load_action_chunking(self, act_package): + """Load ACT policy with action chunking runner.""" + model = InferenceModel(act_package) + assert isinstance(model.runner, ActionChunkingRunner) + + def test_load_iterative(self, diffusion_package): + """Load Diffusion policy with iterative runner.""" + model = InferenceModel(diffusion_package) + assert isinstance(model.runner, IterativeRunner) -## 7. 
Callbacks for Robotics + def test_load_two_phase(self, pi0_package): + """Load PI0 policy with two-phase runner.""" + model = InferenceModel(pi0_package) + assert isinstance(model.runner, TwoPhaseRunner) + + def test_preprocessors_created(self, act_package): + """Preprocessors auto-created from io.preprocessors.""" + model = InferenceModel(act_package) + assert len(model.preprocessors) > 0 + assert isinstance(model.preprocessors[0], StatsNormalizer) + + def test_postprocessors_created(self, act_package): + """Postprocessors auto-created from io.postprocessors.""" + model = InferenceModel(act_package) + assert len(model.postprocessors) > 0 + assert isinstance(model.postprocessors[0], StatsDenormalizer) +``` -physicalai provides callbacks useful for robotics applications: +### Parity Tests -### ActionSafetyCallback +Verify PhysicalAI output matches LeRobot's standalone runtime: ```python -# physicalai/callbacks/safety.py - -class ActionSafetyCallback(Callback): - """Clamp actions to safe ranges.""" - - def __init__( - self, - action_min: np.ndarray | float = -1.0, - action_max: np.ndarray | float = 1.0, - velocity_limit: float | None = None, - ): - self.action_min = action_min - self.action_max = action_max - self.velocity_limit = velocity_limit - self._last_action = None - - def on_predict_end(self, outputs: dict) -> dict: - action = outputs["action"] - - # Clamp to range - action = np.clip(action, self.action_min, self.action_max) - - # Velocity limiting - if self.velocity_limit and self._last_action is not None: - delta = action - self._last_action - delta = np.clip(delta, -self.velocity_limit, self.velocity_limit) - action = self._last_action + delta - - self._last_action = action.copy() - outputs["action"] = action - return outputs - - def on_reset(self): - self._last_action = None +def test_parity_with_lerobot_runtime(pi0_package): + """Output matches LeRobot's own runtime (bit-for-bit with same seed).""" + # Load with PhysicalAI + pai_model = 
InferenceModel(pi0_package) + + # Load with LeRobot standalone + from lerobot.export import load_exported_policy + lr_model = load_exported_policy(pi0_package) + + # Compare outputs with same random seed + obs = generate_test_observation() + np.random.seed(42) + pai_output = pai_model(obs) + np.random.seed(42) + lr_output = lr_model.predict(obs) + + np.testing.assert_allclose(pai_output["action"], lr_output["action"], rtol=1e-5) ``` -### EpisodeLoggingCallback +### Backward Compatibility Tests -```python -# physicalai/callbacks/logging.py - -class EpisodeLoggingCallback(Callback): - """Log episode data for replay/debugging.""" - - def __init__(self, log_dir: Path, log_observations: bool = True): - self.log_dir = Path(log_dir) - self.log_observations = log_observations - self._episode_data = [] - self._episode_count = 0 - - def on_predict_end(self, outputs: dict, inputs: dict | None = None) -> dict: - step_data = {"action": outputs["action"].tolist()} - if self.log_observations and inputs: - step_data["observation"] = {k: v.tolist() for k, v in inputs.items()} - self._episode_data.append(step_data) - return outputs - - def on_reset(self): - if self._episode_data: - self._save_episode() - self._episode_data = [] - self._episode_count += 1 +Verify v1.0 (flat) manifests still load: - def _save_episode(self): - path = self.log_dir / f"episode_{self._episode_count:04d}.json" - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(self._episode_data)) +```python +def test_legacy_flat_manifest(legacy_package): + """v1.0 flat manifest loads without error.""" + model = InferenceModel(legacy_package) + assert model.runner is not None + assert model.adapter is not None ``` --- -## 8. Unified Manifest Format +## 10. Migration from Legacy Formats -All exported models — regardless of source framework — use the same `manifest.json` format. +This is a **clean cut** to the nested manifest format. 
There is no `_normalize_metadata()` shim --- the nested Pydantic models are the only manifest representation. One legacy scenario is handled: -### Why One Format +### Legacy `metadata.yaml` (Pre-Manifest Era) -Previous designs had two formats: `metadata.yaml` for physicalai-train and `manifest.json` for LeRobot. This created unnecessary divergence: +Early PhysicalAI exports used a flat `metadata.yaml` file instead of `manifest.json`. The existing `from_legacy_metadata()` classmethod on `Manifest` handles this case: -- Two parsers to maintain -- Two sets of schema conventions -- Confusion about which format to use +```python +class Manifest(BaseModel): + @classmethod + def from_legacy_metadata(cls, metadata: dict) -> "Manifest": + """Convert old metadata.yaml fields to the nested Manifest structure. + + This handles truly old exports that predate manifest.json entirely. + """ + ... +``` -The unified `manifest.json` format eliminates this. Benefits: +This is the **only** backward compatibility code needed. It converts old YAML metadata to the new nested `Manifest` model once, at load time. -- **One parser** — simpler codebase, fewer bugs -- **One schema** — consistent across all model sources -- **No special-casing** — the loader doesn't need to know where a model came from -- **JSON for data, not code** — `policy.kind` maps to built‑in runners; `class_path` is only for exotic patterns +### Migration Path -| Step | Action | Breaking?
| Target Version | +| ---- | ------ | --------- | -------------- | +| 1 | Implement nested `Manifest` Pydantic models (`manifest.py`) | No --- new code | v1.x (current) | +| 2 | Update `mixin_policy.py` to write nested `manifest.json` | No --- new exports use new format | v1.x | +| 3 | Update `model.py` and `factory.py` to use `Manifest` directly | No --- `from_legacy_metadata()` handles old YAML | v1.x | +| 4 | Update LeRobot PR to write same nested format | No --- same schema | v1.x | +| 5 | Add `manifest.schema.json` for CI validation in both repos | No --- additive | v1.x | -```python -# Works with LeRobot packages -model = InferenceModel("./lerobot_package") +> **Key point:** There is no flat-to-nested migration shim. All new manifests are nested from day one. Only pre-manifest `metadata.yaml` files need the legacy path, and that already exists. -# Works with physicalai-train packages -model = InferenceModel("./physicalai_train_package") +### Schema Enforcement -# Works with custom packages -model = InferenceModel("./custom_package") +To prevent schema drift between PhysicalAI and LeRobot, a shared `manifest.schema.json` (JSON Schema) is maintained and validated against in CI for both projects: -# All read manifest.json — same code path +```text +manifest.schema.json (shared, vendored into both repos) + | + +-- physicalai CI: validate exported manifests against schema + +-- lerobot CI: validate exported manifests against schema ``` +This catches divergence at PR time rather than at runtime. The schema file is the single source of truth for manifest structure. + --- -## 9. Testing Compatibility +## Appendix A: Design Rationale -### Conformance Test Suite +### Why One Format Instead of Two? -```python -# tests/format_loaders/test_lerobot_loader.py +The previous design (v1 plan) proposed two manifest formats: `lerobot_exported_policy` for LeRobot and `policy_package` for PhysicalAI, bridged by a `LeRobotManifestAdapter` class. 
This was rejected because: -class TestLeRobotFormatLoaderConformance: -"""Verify physicalai correctly loads LeRobot packages.""" +1. **Unnecessary complexity** --- An adapter class to translate between nearly-identical JSON schemas is pure overhead. +2. **Divergence risk** --- Two formats inevitably drift apart over time, making the adapter increasingly complex. +3. **Testing burden** --- Every feature needs testing against both formats. +4. **User confusion** --- Which format should I use? Does it matter? - def test_detect_lerobot_package(self, lerobot_package_path): - """Format loader detects LeRobot packages.""" - assert LeRobotFormatLoader.detect(lerobot_package_path) +The converged format eliminates all of these problems. One schema, two producers, zero translation. - def test_load_single_shot(self, act_package_path): - """Load single_shot policy.""" - model = InferenceModel(act_package_path) - assert isinstance(model.runner, SinglePassRunner) +### Why `inference` as a Container? - def test_load_iterative(self, pi0_package_path): - """Load iterative policy.""" - model = InferenceModel(pi0_package_path) - assert isinstance(model.runner, IterativeRunner) - assert model.runner.num_steps == 10 # from manifest - - def test_override_num_steps(self, pi0_package_path): - """Override iterative params at load time.""" - model = InferenceModel(pi0_package_path, num_steps=20) - assert model.runner.num_steps == 20 - - def test_parity_with_lerobot_runtime(self, pi0_package_path): - """Output matches LeRobot's own runtime.""" -# Load with physicalai -ik_model = InferenceModel(pi0_package_path) - - # Load with LeRobot - from lerobot.export import load as lerobot_load - lr_runtime = lerobot_load(pi0_package_path) - - # Compare outputs - obs = generate_test_observation() - np.random.seed(42) - ik_output = ik_model(obs) - np.random.seed(42) - lr_output = lr_runtime.predict_action_chunk(obs) - - np.testing.assert_allclose(ik_output["action"], lr_output, rtol=1e-5) -``` +The 
`inference` section mirrors the `InferenceModel` class hierarchy: ---- +- `InferenceModel` composes runner, adapter, preprocessors, postprocessors +- `inference` contains runner, artifacts (for adapter), io (for pre/postprocessors) -## 10. Summary +This follows the same pattern as training configs where the top-level sections (`model`, `data`, `trainer`) mirror the class hierarchy. It makes the manifest self-documenting: the JSON structure tells you the code structure. -### What physicalai Adds Over LeRobot Runtime +### Why Not `inference: null` for Single-Pass? -| Feature | LeRobot Runtime | physicalai | -| ----------------------------------- | --------------- | ----------------------- | -| Load PolicyPackage | ✓ | ✓ | -| Single-pass inference | ✓ | ✓ | -| Iterative inference | ✓ | ✓ | -| Two-phase inference | ✓ | ✓ | -| Action queue wrapper | ✓ | ✓ | -| Callbacks (timing, logging, safety) | ✗ | ✓ | -| Multi-backend with fallback | ✗ | ✓ | -| Preprocessor/postprocessor chains | ✗ | ✓ | -| Unified manifest format | ✗ | ✓ (same format for all) | +All policies need `n_obs_steps`, `artifacts`, and `io` regardless of runner type. Making `inference` nullable would force these universal fields elsewhere (top-level or in `policy`), breaking the logical grouping. Instead, `inference` is always present --- only the runner params differ between policy types. -### Dependency Direction +### Why Preprocessors Inside `io`? -```text -LeRobot ──────────────────────────────────────────────────┐ - │ │ - │ defines (proposed) │ - ▼ │ -manifest.json (unified format) │ - │ │ - │ consumed by │ - ▼ │ -physicalai (unified manifest loader) ◄─────────┘ - no dependency on LeRobot code -``` +Preprocessors and postprocessors are I/O transforms --- they sit at the boundary between raw observations and model inputs. Placing them inside `io` (alongside `inputs` and `outputs`) makes this relationship explicit. 
The alternative (top-level `preprocessors`/`postprocessors`) separates logically related concepts. -**LeRobot does not depend on physicalai.** -**physicalai can load LeRobot packages without importing LeRobot.** -**physicalai-train exports the same manifest.json format — no special handling needed.** +### Why `policy.source.class_path`? + +The `class_path` field enables PhysicalAI to instantiate the original policy class when the full PhysicalAI training framework is available. LeRobot ignores this field entirely. It is optional --- packages exported by LeRobot standalone may omit it or use a LeRobot-specific class path. + +--- -> **Reminder:** This integration depends on LeRobot adopting the proposed PolicyPackage export format. If LeRobot adopts a different format, the manifest loader implementation changes but the architecture (unified loader, no runtime dependency) remains the same. +## Appendix B: Comparison with Previous Design + +| Aspect | Previous Design (v1) | Current Design (converged) | +| ------------------ | -------------------------------------------------- | ---------------------------------------------- | +| Manifest formats | Two (`lerobot_exported_policy` + `policy_package`) | One (`policy_package`) | +| Format adapter | `LeRobotManifestAdapter` class (~100 lines) | None --- direct Pydantic parsing | +| Format detection | `if format == "lerobot_exported_policy"` branching | Not needed --- single format | +| Schema maintenance | Two schemas to keep in sync | One shared `manifest.schema.json` | +| Test matrix | 2x (each feature tested against both formats) | 1x | +| Normalization | Adapter auto-generates ComponentSpecs | Manifest declares pre/postprocessors directly | +| Runner resolution | `policy.kind` + separate `inference` block | Dual-path: `class_path` OR `type` → registry → `instantiate_component()` | +| Action params | Separate `action` section | Params in `inference.runner` | +| Backward compat | Format detection + adapter routing | 
`from_legacy_metadata()` for pre-manifest YAML only | +| Component formats | `class_path` + `init_args` only | Both `type` + flat params (interop) and `class_path` + `init_args` (full-power) | --- ## Related Documents -- **[Strategy](../../architecture/strategy.md)** - Big-picture architecture -- **[Inference Core Design](./inferencekit.md)** - Domain-agnostic inference layer -- **[LeRobot Export Suggestions](../internal/lerobot-export-suggestions.md)** - Our proposed improvements to LeRobot's export API +- **[Inference Core Design](../components/inferencekit.md)** --- Domain-agnostic inference layer +- **[Strategy](../architecture/strategy.md)** --- Big-picture architecture and layering decisions +- **[Architecture](../architecture/architecture.md)** --- PhysicalAI runtime CLI and packaging --- -_Document version: 3.0_ -_Last updated: 2026-02-16_ +_Document version: 5.0_ +_Last updated: 2026-03-27_ From 448bed3e289282719dd159f7ed84f89bca3b4310 Mon Sep 17 00:00:00 2001 From: Samet Date: Tue, 31 Mar 2026 11:42:28 +0200 Subject: [PATCH 2/4] =?UTF-8?q?docs:=20rename=20manifest=20'inference'=20?= =?UTF-8?q?=E2=86=92=20'model',=20use=20full=20class=5Fpath?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR #7 review feedback from @maxxgx: 1. Rename top-level 'inference' section to 'model' in manifest format. 'model.io' is self-documenting (model's I/O shapes), removing the ambiguity with hardware I/O that 'inference.io' caused. 2. Fix class_path examples to always use full Python paths: - 'action_chunking' → 'physicalai.inference.runners.ActionChunkingRunner' - 'normalize' → 'physicalai.inference.preprocessors.StatsNormalizer' - 'denormalize' → 'physicalai.inference.postprocessors.StatsDenormalizer' 3. Verified stats_path is already correct in class_path+init_args examples. Updated both inferencekit.md and lerobot.md to v5.1. 
--- docs/design/components/inferencekit.md | 26 +++--- docs/design/integrations/lerobot.md | 111 ++++++++++++------------- 2 files changed, 68 insertions(+), 69 deletions(-) diff --git a/docs/design/components/inferencekit.md b/docs/design/components/inferencekit.md index 2b16c39..3a5c27e 100644 --- a/docs/design/components/inferencekit.md +++ b/docs/design/components/inferencekit.md @@ -438,7 +438,7 @@ class Postprocessor(ABC): ### Manifest Format -All exported models use a unified `manifest.json` format. The manifest uses a nested structure that mirrors the `InferenceModel` class hierarchy, with logical sections for policy identity, inference configuration, hardware, and metadata: +All exported models use a unified `manifest.json` format. The manifest uses a nested structure that mirrors the `InferenceModel` class hierarchy, with logical sections for policy identity, model configuration, hardware, and metadata: ```text manifest.json @@ -446,7 +446,7 @@ manifest.json ├── policy (identity — what policy is this?) │ ├── name (human-readable name) │ └── source (provenance: repo_id, class_path) -├── inference (InferenceModel — how to run it?) +├── model (exported model — how to run it?) 
│ ├── n_obs_steps (observation window) │ ├── runner (execution pattern + params) │ ├── artifacts (model files by named role) @@ -468,7 +468,7 @@ manifest.json "class_path": "mypackage.policies.MyPolicy" } }, - "inference": { + "model": { "n_obs_steps": 1, "runner": { "type": "action_chunking", @@ -532,10 +532,10 @@ PhysicalAI can also write manifests using the full `class_path` + `init_args` fo "class_path": "physicalai.policies.act.policy.ACT" } }, - "inference": { + "model": { "n_obs_steps": 1, "runner": { - "class_path": "action_chunking", + "class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": { "chunk_size": 100, "n_action_steps": 100 @@ -554,7 +554,7 @@ PhysicalAI can also write manifests using the full `class_path` + `init_args` fo ], "preprocessors": [ { - "class_path": "normalize", + "class_path": "physicalai.inference.preprocessors.StatsNormalizer", "init_args": { "mode": "mean_std", "stats_path": "stats.safetensors", @@ -564,7 +564,7 @@ PhysicalAI can also write manifests using the full `class_path` + `init_args` fo ], "postprocessors": [ { - "class_path": "denormalize", + "class_path": "physicalai.inference.postprocessors.StatsDenormalizer", "init_args": { "mode": "mean_std", "stats_path": "stats.safetensors", @@ -594,9 +594,9 @@ The framework reads `manifest.json` and resolves the model configuration using * 1. **Manifest parsing**: `manifest.json` is parsed directly into nested Pydantic models --- no flattening or normalization step. 2. 
**Runner resolution**: Components support two formats that both resolve through the same `ComponentRegistry` + `instantiate_component()` pipeline: - **`type` + flat params** (interoperable, written by LeRobot): `{"type": "action_chunking", "chunk_size": 100}` → registry lookup → `ComponentSpec` → `instantiate_component()` - - **`class_path` + `init_args`** (full-power, written by PhysicalAI): `{"class_path": "action_chunking", "init_args": {"chunk_size": 100}}` → `ComponentSpec` → `instantiate_component()` -3. **Backend selection**: `inference.artifacts` maps named roles (e.g., `"model"`, `"encoder"`) to filenames. The first available backend is auto-selected, or the user can override at load time. -4. **I/O pipeline**: `inference.io.preprocessors` and `inference.io.postprocessors` declare input/output transforms (normalization, denormalization) resolved via the same dual-path mechanism. + - **`class_path` + `init_args`** (full-power, written by PhysicalAI): `{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100}}` → `ComponentSpec` → `instantiate_component()` +3. **Backend selection**: `model.artifacts` maps named roles (e.g., `"model"`, `"encoder"`) to filenames. The first available backend is auto-selected, or the user can override at load time. +4. **I/O pipeline**: `model.io.preprocessors` and `model.io.postprocessors` declare input/output transforms (normalization, denormalization) resolved via the same dual-path mechanism. 5. **Hardware validation**: `hardware.robots` and `hardware.cameras` sections declare expected shapes. The runtime can validate observations against these. 6. **Custom components**: Domain layers can extend the manifest with custom processor types or runner parameters without modifying inferencekit. Any component with a `class_path` is instantiated directly; any component with a `type` goes through the registry. 
@@ -694,7 +694,7 @@ Domain layers can publish model packages to HuggingFace that include: "class_path": "my_domain_inference.models.MyModel" } }, - "inference": { + "model": { "n_obs_steps": 1, "runner": { "type": "action_chunking", @@ -1223,5 +1223,5 @@ Instead of replacing model_api, inferencekit provides the **foundation** that mo --- -_Document Version: 5.0_ -_Last Updated: 2026-03-27_ +_Document Version: 5.1_ +_Last Updated: 2026-03-31_ diff --git a/docs/design/integrations/lerobot.md b/docs/design/integrations/lerobot.md index c0532f0..f368e5c 100644 --- a/docs/design/integrations/lerobot.md +++ b/docs/design/integrations/lerobot.md @@ -2,7 +2,7 @@ **Status**: Proposal **Author**: Samet Akcay -**Date**: 2026-03-27 +**Date**: 2026-03-31 **Relates to**: [Inference Core Design](../components/inferencekit.md) --- @@ -36,7 +36,7 @@ The manifest supports two ways to specify components (runners, preprocessors, po | Format | Who writes | Who reads | Example | | ------------------------------ | ---------------------------------- | -------------------- | --------------------------------------------------------------------- | | **`type` + flat params** | LeRobot, simple PhysicalAI exports | Both (interoperable) | `{"type": "action_chunking", "chunk_size": 100}` | -| **`class_path` + `init_args`** | PhysicalAI (full-power) | PhysicalAI only | `{"class_path": "action_chunking", "init_args": {"chunk_size": 100}}` | +| **`class_path` + `init_args`** | PhysicalAI (full-power) | PhysicalAI only | `{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100}}` | PhysicalAI resolves both through the same `ComponentRegistry` + `instantiate_component()` pipeline. LeRobot only reads `type` and maps to its own implementations. See [Runner Resolution](#runner-resolution) for the resolution algorithm. 
@@ -56,7 +56,7 @@ PhysicalAI resolves both through the same `ComponentRegistry` + `instantiate_com - [Field Reference](#field-reference) - [Top-Level Envelope](#top-level-envelope) - [`policy` --- Identity](#policy-----identity) - - [`inference` --- How to Run](#inference-----how-to-run) + - [`model` --- How to Run](#model-----how-to-run) - [`hardware` --- Deployment](#hardware-----deployment) - [`metadata` --- Provenance](#metadata-----provenance) - [Preprocessor / Postprocessor Entry](#preprocessor--postprocessor-entry) @@ -67,8 +67,8 @@ PhysicalAI resolves both through the same `ComponentRegistry` + `instantiate_com - [Preprocessor and Postprocessor Construction](#preprocessor-and-postprocessor-construction) - [4. How LeRobot Uses the Manifest](#4-how-lerobot-uses-the-manifest) - [5. Runner Mapping](#5-runner-mapping) - - [`inference.runner.type` to Runner](#inferencerunnertype-to-runner) - - [Runner Parameters (All in `inference.runner`)](#runner-parameters-all-in-inferencerunner) + - [`model.runner.type` to Runner](#modelrunnertype-to-runner) + - [Runner Parameters (All in `model.runner`)](#runner-parameters-all-in-modelrunner) - [6. 
Normalization Handling](#6-normalization-handling) - [Problem](#problem) - [Solution: Preprocessor and Postprocessor Entries](#solution-preprocessor-and-postprocessor-entries) @@ -90,8 +90,8 @@ PhysicalAI resolves both through the same `ComponentRegistry` + `instantiate_com - [Schema Enforcement](#schema-enforcement) - [Appendix A: Design Rationale](#appendix-a-design-rationale) - [Why One Format Instead of Two?](#why-one-format-instead-of-two) - - [Why `inference` as a Container?](#why-inference-as-a-container) - - [Why Not `inference: null` for Single-Pass?](#why-not-inference-null-for-single-pass) + - [Why `model` as a Container?](#why-model-as-a-container) + - [Why Not `model: null` for Single-Pass?](#why-not-model-null-for-single-pass) - [Why Preprocessors Inside `io`?](#why-preprocessors-inside-io) - [Why `policy.source.class_path`?](#why-policysourceclass_path) - [Appendix B: Comparison with Previous Design](#appendix-b-comparison-with-previous-design) @@ -166,7 +166,7 @@ manifest.json +-- policy (identity --- what policy is this?) | +-- name (human-readable name) | +-- source (provenance: repo_id, class_path) -+-- inference (InferenceModel --- how to run it?) ++-- model (exported model --- how to run it?) | +-- n_obs_steps (observation window size) | +-- runner (execution pattern + parameters) | +-- artifacts (model files by named role) @@ -190,7 +190,7 @@ manifest.json "class_path": "physicalai.policies.act.policy.ACT" } }, - "inference": { + "model": { "n_obs_steps": 1, "runner": { "type": "action_chunking", @@ -241,7 +241,7 @@ manifest.json ### Runner Variants -The `inference.runner` section is open-ended --- policy-specific parameters go directly in the runner object alongside `type`. This avoids the need for a rigid union schema. Each runner implementation declares its expected parameters and logs a warning for any unrecognized keys (see [Runner Parameter Validation](#runner-parameter-validation)). 
+The `model.runner` section is open-ended --- policy-specific parameters go directly in the runner object alongside `type`. This avoids the need for a rigid union schema. Each runner implementation declares its expected parameters and logs a warning for any unrecognized keys (see [Runner Parameter Validation](#runner-parameter-validation)). **ACT / VQBeT** (single-pass with action chunking): @@ -267,7 +267,7 @@ The `inference.runner` section is open-ended --- policy-specific parameters go d **PI0** (two-phase: encode once + denoise iteratively): -For two-phase policies, all model artifacts are listed in `inference.artifacts` with named roles (`encoder`, `denoise`) rather than backend names. The runner references these roles: +For two-phase policies, all model artifacts are listed in `model.artifacts` with named roles (`encoder`, `denoise`) rather than backend names. The runner references these roles: ```json "artifacts": { @@ -328,19 +328,19 @@ For two-phase policies, all model artifacts are listed in `inference.artifacts` | `policy.source.repo_id` | string | No | HuggingFace repo ID | | `policy.source.class_path` | string | No | Original Python class path | -#### `inference` --- How to Run +#### `model` --- How to Run | Field | Type | Required | Description | | ----------------------------- | ------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `inference.n_obs_steps` | int | Yes | Number of observation timesteps needed by the model (see [n_obs_steps rationale](#why-n_obs_steps-in-inference)) | -| `inference.runner` | object | Yes | Runner configuration (see variants) | -| `inference.runner.type` | string | Yes | Runner type: `action_chunking`, `iterative`, `two_phase` | -| `inference.artifacts` | object | Yes | Map of artifact role to filename. Single-model: `{"model": "model.onnx"}`. 
Two-phase: `{"encoder": "encoder.onnx", "denoise": "denoise.onnx"}` | -| `inference.io` | object | Yes | I/O specification | -| `inference.io.inputs` | array | Yes | Input tensor specifications | -| `inference.io.outputs` | array | Yes | Output tensor specifications | -| `inference.io.preprocessors` | array | No | Input transforms (normalize, etc.) | -| `inference.io.postprocessors` | array | No | Output transforms (denormalize, etc.) | +| `model.n_obs_steps` | int | Yes | Number of observation timesteps needed by the model (see [Design Decisions](#design-decisions)) | +| `model.runner` | object | Yes | Runner configuration (see variants) | +| `model.runner.type` | string | Yes | Runner type: `action_chunking`, `iterative`, `two_phase` | +| `model.artifacts` | object | Yes | Map of artifact role to filename. Single-model: `{"model": "model.onnx"}`. Two-phase: `{"encoder": "encoder.onnx", "denoise": "denoise.onnx"}` | +| `model.io` | object | Yes | I/O specification | +| `model.io.inputs` | array | Yes | Input tensor specifications | +| `model.io.outputs` | array | Yes | Output tensor specifications | +| `model.io.preprocessors` | array | No | Input transforms (normalize, etc.) | +| `model.io.postprocessors` | array | No | Output transforms (denormalize, etc.) | #### `hardware` --- Deployment @@ -381,17 +381,17 @@ Built-in types resolve by convention: `"normalize"` maps to `StatsNormalizer`, ` | Decision | Rationale | | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| **No `policy.type` field** | `inference.runner.type` is the sole source of truth for runner construction. 
A separate `policy.type` would overlap without matching (e.g., ACT is `single_pass` but uses `action_chunking` runner), causing confusion. Eliminated to avoid ambiguity. | +| **No `policy.type` field** | `model.runner.type` is the sole source of truth for runner construction. A separate `policy.type` would overlap without matching (e.g., ACT is `single_pass` but uses `action_chunking` runner), causing confusion. Eliminated to avoid ambiguity. | | **`hardware` is top-level** | Deployment needs hardware information even if LeRobot doesn't use it yet. Cheap future-proofing. | -| **Preprocessors inside `inference.io`** | They are I/O transforms, logically part of the I/O boundary. Not separate top-level sections. | +| **Preprocessors inside `model.io`** | They are I/O transforms, logically part of the I/O boundary. Not separate top-level sections. | | **`format` + `version` kept** | Cheap future-proofing for schema evolution. `format` enables detection, `version` enables migration. | -| **`inference` is a container** | Mirrors `InferenceModel` class hierarchy. Matches training config philosophy (`model`/`data`/`trainer`). | +| **`model` is a container** | Mirrors `InferenceModel` class hierarchy. Matches training config philosophy (`model`/`data`/`trainer`). | | **No separate `action` section** | `chunk_size` and `n_action_steps` are runner behavioral params. `action_dim` is redundant with output shape. | | **`mode` per processor** | Different features may use different normalization modes (mean_std vs min_max). | | **Runner params are open with validation** | Policy-specific fields go directly in runner. Each runner declares expected params and warns on unknown keys (see [Runner Parameter Validation](#runner-parameter-validation)). | -| **Named artifact roles** | `inference.artifacts` uses role names (`model`, `encoder`, `denoise`) not backend names. 
This eliminates the split between `inference.artifacts` and runner-level artifact refs for two-phase policies, giving a single authoritative location for all model files. | +| **Named artifact roles** | `model.artifacts` uses role names (`model`, `encoder`, `denoise`) not backend names. This eliminates the split between `model.artifacts` and runner-level artifact refs for two-phase policies, giving a single authoritative location for all model files. | | **Extensible processor types** | Built-in types (`normalize`, `denormalize`) resolve by convention. Unknown types fall back to `class_path`, allowing custom processors without code changes to the loader. | -| **`n_obs_steps` in `inference`** | Although `n_obs_steps` describes what the model expects (a contract), it is consumed during inference preparation --- the caller uses it to decide how many observation frames to collect before calling the model. It sits alongside other inference-time parameters rather than in `policy` (which is purely identity/provenance). | +| **`n_obs_steps` in `model`** | Although `n_obs_steps` describes what the model expects (a contract), it is consumed during inference preparation --- the caller uses it to decide how many observation frames to collect before calling the model. It sits alongside other inference-time parameters rather than in `policy` (which is purely identity/provenance). | | **Shared JSON Schema for CI** | Both projects validate exported manifests against a shared `manifest.schema.json` to prevent schema drift (see [Schema Enforcement](#schema-enforcement)). | | **Dual component resolution** | Components support both `type` + flat params (interoperable, LeRobot-compatible) and `class_path` + `init_args` (PhysicalAI full-power). Both resolve through the same `ComponentRegistry` + `instantiate_component()` pipeline. This avoids per-type if-chains while preserving the existing component system. See [Dual Component Resolution](#dual-component-resolution). 
| | **No `_normalize_metadata()`** | The nested manifest structure is parsed directly into Pydantic models. No flattening shim needed --- both the schema and the loader are designed together. Legacy `metadata.yaml` files (pre-manifest era) are handled by `from_legacy_metadata()` only. | @@ -417,10 +417,10 @@ if raw.get("format") != "policy_package": manifest = Manifest.model_validate(raw) # Resolve components from typed manifest fields -runner = resolve_runner(manifest.inference.runner) -adapter = create_adapter(manifest.inference.artifacts, path) -preprocessors = resolve_processors(manifest.inference.io.preprocessors, path) -postprocessors = resolve_processors(manifest.inference.io.postprocessors, path) +runner = resolve_runner(manifest.model.runner) +adapter = create_adapter(manifest.model.artifacts, path) +preprocessors = resolve_processors(manifest.model.io.preprocessors, path) +postprocessors = resolve_processors(manifest.model.io.postprocessors, path) ``` > **Legacy `metadata.yaml` files** (pre-manifest era, before `manifest.json` existed) are handled separately by `from_legacy_metadata()` in `manifest.py`. This is unrelated to the manifest format --- it handles the old YAML-based metadata from early PhysicalAI exports. 
@@ -450,12 +450,12 @@ def resolve_runner(runner_config: dict) -> InferenceRunner: return instantiate_component(spec) ``` -**How `instantiate_component()` handles `class_path` values:** +**How `instantiate_component()` handles the two paths:** -- Full path (e.g., `"physicalai.inference.runners.ActionChunkingRunner"`) → direct import -- Short name (e.g., `"action_chunking"`) → `ComponentRegistry.resolve()` maps to full path → import +- **`class_path`** (full Python path, e.g., `"physicalai.inference.runners.ActionChunkingRunner"`) → direct import +- **`type`** (short name, e.g., `"action_chunking"`) → `ComponentRegistry.resolve()` maps to full path → import -This means `type: "action_chunking"` and `class_path: "action_chunking"` both resolve identically through the registry. The registry is the single source of truth for mapping names to classes. +`class_path` always uses the full Python class path for explicit, unambiguous resolution. `type` uses the registry as the single source of truth for mapping short names to classes. **Example: How the same runner loads from both formats:** @@ -464,12 +464,11 @@ This means `type: "action_chunking"` and `class_path: "action_chunking"` both re {"type": "action_chunking", "chunk_size": 100, "n_action_steps": 100} // PhysicalAI writes (class_path + init_args): -{"class_path": "action_chunking", "init_args": {"chunk_size": 100, "n_action_steps": 100}} +{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100, "n_action_steps": 100}} -// Both resolve to: -// ComponentSpec(class_path="action_chunking", init_args={"chunk_size": 100, "n_action_steps": 100}) -// → ComponentRegistry.resolve("action_chunking") → ActionChunkingRunner -// → instantiate_component(spec) → ActionChunkingRunner(chunk_size=100, n_action_steps=100) +// Both resolve to the same ActionChunkingRunner: +// type path: "action_chunking" → ComponentRegistry.resolve() → ActionChunkingRunner(...) 
+// class_path path: "physicalai.inference.runners.ActionChunkingRunner" → direct import → ActionChunkingRunner(...) ``` ### Runner Parameter Validation @@ -528,7 +527,7 @@ def resolve_processors(specs: list[dict], path: Path) -> list: {"type": "normalize", "mode": "mean_std", "artifact": "stats.safetensors", "features": ["observation.state"]} // PhysicalAI writes (class_path + init_args): -{"class_path": "normalize", "init_args": {"mode": "mean_std", "stats_path": "stats.safetensors", "features": ["observation.state"]}} +{"class_path": "physicalai.inference.preprocessors.StatsNormalizer", "init_args": {"mode": "mean_std", "stats_path": "stats.safetensors", "features": ["observation.state"]}} // Both resolve to: StatsNormalizer(mode="mean_std", stats_path=..., features=["observation.state"]) ``` @@ -552,7 +551,7 @@ def load_exported_policy(path: str | Path) -> ExportedPolicy: raw = json.loads((path / "manifest.json").read_text()) # Read runner config - runner_config = raw["inference"]["runner"] + runner_config = raw["model"]["runner"] runner_type = runner_config["type"] # Build LeRobot's own runner (standalone, numpy-only) @@ -572,12 +571,12 @@ def load_exported_policy(path: str | Path) -> ExportedPolicy: ... # Load normalizer from io specs - preprocessors = raw["inference"]["io"].get("preprocessors", []) - postprocessors = raw["inference"]["io"].get("postprocessors", []) + preprocessors = raw["model"]["io"].get("preprocessors", []) + postprocessors = raw["model"]["io"].get("postprocessors", []) normalizer = Normalizer.from_specs(preprocessors + postprocessors, path) # Load backend adapter (from named artifact role) - artifacts = raw["inference"]["artifacts"] + artifacts = raw["model"]["artifacts"] adapter = ONNXRuntimeAdapter(path / artifacts["model"]) return ExportedPolicy(runner=runner, adapter=adapter, normalizer=normalizer) @@ -589,7 +588,7 @@ def load_exported_policy(path: str | Path) -> ExportedPolicy: ## 5. 
Runner Mapping -### `inference.runner.type` to Runner +### `model.runner.type` to Runner | `runner.type` | PhysicalAI Runner | LeRobot Runner | Policies | | ----------------- | ------------------------------------ | ----------------------------------------- | ---------------- | @@ -597,7 +596,7 @@ def load_exported_policy(path: str | Path) -> ExportedPolicy: | `iterative` | `IterativeRunner(SinglePass())` | `IterativeRunner` | Diffusion, TDMPC | | `two_phase` | `TwoPhaseRunner(encoder, Iterative)` | `TwoPhaseRunner` | PI0, SmolVLA | -### Runner Parameters (All in `inference.runner`) +### Runner Parameters (All in `model.runner`) | Parameter | Used By | Description | | --------------------- | ------------------------------------- | --------------------------------------- | @@ -609,7 +608,7 @@ def load_exported_policy(path: str | Path) -> ExportedPolicy: | `use_mpc` | iterative | Enable model-predictive control (TDMPC) | | `cem_iterations` | iterative | CEM optimization iterations (TDMPC) | -> **Note:** Two-phase artifact paths (`encoder`, `denoise`) live in `inference.artifacts`, not in the runner config. See [Runner Variants](#runner-variants) for examples. +> **Note:** Two-phase artifact paths (`encoder`, `denoise`) live in `model.artifacts`, not in the runner config. See [Runner Variants](#runner-variants) for examples. --- @@ -624,7 +623,7 @@ LeRobot policies operate on **normalized** inputs and produce **normalized** out ### Solution: Preprocessor and Postprocessor Entries -The manifest declares normalization as I/O transforms in `inference.io.preprocessors` and `inference.io.postprocessors`: +The manifest declares normalization as I/O transforms in `model.io.preprocessors` and `model.io.postprocessors`: ```json "io": { @@ -923,18 +922,18 @@ The previous design (v1 plan) proposed two manifest formats: `lerobot_exported_p The converged format eliminates all of these problems. One schema, two producers, zero translation. -### Why `inference` as a Container? 
+### Why `model` as a Container? -The `inference` section mirrors the `InferenceModel` class hierarchy: +The `model` section mirrors the `InferenceModel` class hierarchy: - `InferenceModel` composes runner, adapter, preprocessors, postprocessors -- `inference` contains runner, artifacts (for adapter), io (for pre/postprocessors) +- `model` contains runner, artifacts (for adapter), io (for pre/postprocessors) This follows the same pattern as training configs where the top-level sections (`model`, `data`, `trainer`) mirror the class hierarchy. It makes the manifest self-documenting: the JSON structure tells you the code structure. -### Why Not `inference: null` for Single-Pass? +### Why Not `model: null` for Single-Pass? -All policies need `n_obs_steps`, `artifacts`, and `io` regardless of runner type. Making `inference` nullable would force these universal fields elsewhere (top-level or in `policy`), breaking the logical grouping. Instead, `inference` is always present --- only the runner params differ between policy types. +All policies need `n_obs_steps`, `artifacts`, and `io` regardless of runner type. Making `model` nullable would force these universal fields elsewhere (top-level or in `policy`), breaking the logical grouping. Instead, `model` is always present --- only the runner params differ between policy types. ### Why Preprocessors Inside `io`? 
@@ -957,7 +956,7 @@ The `class_path` field enables PhysicalAI to instantiate the original policy cla | Test matrix | 2x (each feature tested against both formats) | 1x | | Normalization | Adapter auto-generates ComponentSpecs | Manifest declares pre/postprocessors directly | | Runner resolution | `policy.kind` + separate `inference` block | Dual-path: `class_path` OR `type` → registry → `instantiate_component()` | -| Action params | Separate `action` section | Params in `inference.runner` | +| Action params | Separate `action` section | Params in `model.runner` | | Backward compat | Format detection + adapter routing | `from_legacy_metadata()` for pre-manifest YAML only | | Component formats | `class_path` + `init_args` only | Both `type` + flat params (interop) and `class_path` + `init_args` (full-power) | @@ -971,5 +970,5 @@ The `class_path` field enables PhysicalAI to instantiate the original policy cla --- -_Document version: 5.0_ -_Last updated: 2026-03-27_ +_Document version: 5.1_ +_Last updated: 2026-03-31_ From 93b029fbb3a1b7c135554095e72130a47ddf0687 Mon Sep 17 00:00:00 2001 From: Samet Date: Tue, 31 Mar 2026 12:40:40 +0200 Subject: [PATCH 3/4] docs: simplify inferencekit and lerobot design docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aggressively trim both design docs for coherency and readability: - inferencekit.md: 1202 → 742 lines (cut Appendix, API Reference, verbose domain examples, full runner implementations) - lerobot.md: ~960 → ~420 lines (cut rationale appendices, testing/migration sections, verbose code blocks) Keep: Architecture, Core Components, Manifest Format (both type and class_path examples), Dual Resolution, Runners, Usage Examples. 
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- docs/design/components/inferencekit.md | 613 +++------------------- docs/design/integrations/lerobot.md | 683 ++++--------------------- 2 files changed, 173 insertions(+), 1123 deletions(-) diff --git a/docs/design/components/inferencekit.md b/docs/design/components/inferencekit.md index 3a5c27e..60216d8 100644 --- a/docs/design/components/inferencekit.md +++ b/docs/design/components/inferencekit.md @@ -26,35 +26,14 @@ - [Preprocessors and Postprocessors](#preprocessors-and-postprocessors) - [Manifest Format](#manifest-format) - [Extension \& Plugin System](#extension--plugin-system) - - [Backend Registry](#backend-registry) - - [Building a Custom Domain Layer](#building-a-custom-domain-layer) - - [Publishing to HuggingFace](#publishing-to-huggingface) - [Runners (Domain-Provided)](#runners-domain-provided) - - [Contrib Runners](#contrib-runners) - [Supported Backends](#supported-backends) - - [Domain Layer Examples](#domain-layer-examples) - - [Example 1: Vision (model_api)](#example-1-vision-model_api) - - [Example 2: Physical‑AI Plugins](#example-2-physicalai-plugins) - - [Example 3: Custom Domain](#example-3-custom-domain) + - [Domain Layer Example](#domain-layer-example) - [Usage Examples](#usage-examples) - [Basic usage](#basic-usage) - [With explicit backend](#with-explicit-backend) - [With callbacks](#with-callbacks) - [Context manager for resource cleanup](#context-manager-for-resource-cleanup) - - [API Reference](#api-reference) - - [Main Entry Point](#main-entry-point) - - [Runners](#runners) - - [Adapters](#adapters) - - [Callbacks](#callbacks) - - [Plugins](#plugins) - - [Extension Points](#extension-points) - - [Appendix: Design Rationale](#appendix-design-rationale) - - [Why a separate inference package?](#why-a-separate-inference-package) - - [Why inferencekit is a base layer, not a model_api 
replacement](#why-inferencekit-is-a-base-layer-not-a-model_api-replacement) - - [Migration path for model_api](#migration-path-for-model_api) - - [Why runners are separate from adapters?](#why-runners-are-separate-from-adapters) - - [Why callbacks instead of inheritance?](#why-callbacks-instead-of-inheritance) - - [Why a plugin system?](#why-a-plugin-system) - [Related Documents](#related-documents) --- @@ -450,7 +429,8 @@ manifest.json │ ├── n_obs_steps (observation window) │ ├── runner (execution pattern + params) │ ├── artifacts (model files by named role) -│ └── io (shapes, preprocessors, postprocessors) +│ ├── preprocessors (input transforms: normalize, etc.) +│ └── postprocessors (output transforms: denormalize, etc.) ├── hardware (deployment — what hardware?) │ ├── robots (robot configurations) │ └── cameras (camera configurations) @@ -478,31 +458,22 @@ manifest.json "artifacts": { "model": "model.onnx" }, - "io": { - "inputs": [ - {"name": "observation.image", "dtype": "float32", "shape": ["B", 3, 96, 96]}, - {"name": "observation.state", "dtype": "float32", "shape": ["B", 14]} - ], - "outputs": [ - {"name": "action", "dtype": "float32", "shape": ["B", 100, 14]} - ], - "preprocessors": [ - { - "type": "normalize", - "mode": "mean_std", - "artifact": "stats.safetensors", - "features": ["observation.state"] - } - ], - "postprocessors": [ - { - "type": "denormalize", - "mode": "mean_std", - "artifact": "stats.safetensors", - "features": ["action"] - } - ] - } + "preprocessors": [ + { + "type": "normalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["observation.state"] + } + ], + "postprocessors": [ + { + "type": "denormalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["action"] + } + ] }, "hardware": { "robots": [], @@ -544,35 +515,26 @@ PhysicalAI can also write manifests using the full `class_path` + `init_args` fo "artifacts": { "model": "model.onnx" }, - "io": { - "inputs": [ - {"name": 
"observation.image", "dtype": "float32", "shape": ["B", 3, 96, 96]}, - {"name": "observation.state", "dtype": "float32", "shape": ["B", 14]} - ], - "outputs": [ - {"name": "action", "dtype": "float32", "shape": ["B", 100, 14]} - ], - "preprocessors": [ - { - "class_path": "physicalai.inference.preprocessors.StatsNormalizer", - "init_args": { - "mode": "mean_std", - "stats_path": "stats.safetensors", - "features": ["observation.state"] - } + "preprocessors": [ + { + "class_path": "physicalai.inference.preprocessors.StatsNormalizer", + "init_args": { + "mode": "mean_std", + "stats_path": "stats.safetensors", + "features": ["observation.state"] } - ], - "postprocessors": [ - { - "class_path": "physicalai.inference.postprocessors.StatsDenormalizer", - "init_args": { - "mode": "mean_std", - "stats_path": "stats.safetensors", - "features": ["action"] - } + } + ], + "postprocessors": [ + { + "class_path": "physicalai.inference.postprocessors.StatsDenormalizer", + "init_args": { + "mode": "mean_std", + "stats_path": "stats.safetensors", + "features": ["action"] } - ] - } + } + ] }, "hardware": { "robots": [], @@ -596,7 +558,7 @@ The framework reads `manifest.json` and resolves the model configuration using * - **`type` + flat params** (interoperable, written by LeRobot): `{"type": "action_chunking", "chunk_size": 100}` → registry lookup → `ComponentSpec` → `instantiate_component()` - **`class_path` + `init_args`** (full-power, written by PhysicalAI): `{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100}}` → `ComponentSpec` → `instantiate_component()` 3. **Backend selection**: `model.artifacts` maps named roles (e.g., `"model"`, `"encoder"`) to filenames. The first available backend is auto-selected, or the user can override at load time. -4. **I/O pipeline**: `model.io.preprocessors` and `model.io.postprocessors` declare input/output transforms (normalization, denormalization) resolved via the same dual-path mechanism. +4. 
**I/O pipeline**: `model.preprocessors` and `model.postprocessors` declare input/output transforms (normalization, denormalization) resolved via the same dual-path mechanism. 5. **Hardware validation**: `hardware.robots` and `hardware.cameras` sections declare expected shapes. The runtime can validate observations against these. 6. **Custom components**: Domain layers can extend the manifest with custom processor types or runner parameters without modifying inferencekit. Any component with a `class_path` is instantiated directly; any component with a `type` goes through the registry. @@ -606,132 +568,33 @@ The framework reads `manifest.json` and resolves the model configuration using * ## Extension & Plugin System -inferencekit only supports **backend adapters** as extensions. All domain plugins live above it (physical‑ai‑framework, model_api, custom layers). - -### Backend Registry - -inferencekit exposes a backend registry for RuntimeAdapters. Domain plugins are not registered here. +inferencekit supports **backend adapters** as extensions via a registry. Domain-specific plugins (runners, processors, models) live in their respective domain layers, not in inferencekit. -### Building a Custom Domain Layer +**Backend registry:** New backends implement `RuntimeAdapter` and register via Python entry points (`inferencekit.backends`). Domain layers register runners and callbacks via their own entry points (`inferencekit.runners`, `inferencekit.callbacks`). -Anyone can create a domain-specific inference layer on top of inferencekit. Here's the pattern: - -**Step 1: Define your domain model** +**Building a custom domain layer:** Subclass `InferenceModel`, implement domain-specific runners and pre/postprocessors, and register via entry points: ```python -# my_domain_inference/model.py -from inferencekit import InferenceModel - +# my_domain/model.py — subclass InferenceModel class MyDomainModel(InferenceModel): - """Domain-specific inference model. 
- - Extends InferenceModel with domain-specific methods, - preprocessing, and postprocessing. - """ - def __init__(self, path, **kwargs): super().__init__(path, **kwargs) - # Attach domain preprocessors/postprocessors - self.preprocessors = self._load_preprocessors(path) - self.postprocessors = self._load_postprocessors(path) + self.preprocessors = [MyPreprocessor()] def domain_predict(self, domain_inputs): - """Domain-specific prediction method.""" - # Preprocess domain inputs -> generic inputs inputs = self._preprocess(domain_inputs) - # Run generic inference - outputs = self(inputs) - # Postprocess generic outputs -> domain outputs - return self._postprocess(outputs) + return self(inputs) ``` -**Step 2: Define domain-specific runners (if needed)** - -```python -# my_domain_inference/runners.py -from inferencekit.runners import InferenceRunner - -class MyDomainRunner(InferenceRunner): - """Runner for domain-specific inference patterns.""" - - def run(self, adapter, inputs): - # Implement domain-specific execution logic - ... -``` - -**Step 3: Register via entry points** - ```toml -# my_domain_inference/pyproject.toml +# pyproject.toml — register custom runners [project.entry-points."inferencekit.runners"] -my_domain_runner = "my_domain_inference.runners:MyDomainRunner" -``` - -**Step 4: Package and distribute** - -```bash -# Publish to PyPI -pip install my-domain-inference - -# Or publish to HuggingFace (see below) -``` - -### Publishing to HuggingFace - -Domain layers can publish model packages to HuggingFace that include: - -1. **Exported model artifacts** (ONNX, OpenVINO, etc.) -2. **Manifest** (`manifest.json`) specifying the inferencekit runner, preprocessors, etc. -3. 
**Domain package dependency** declared in the manifest - -```json -{ - "format": "policy_package", - "version": "1.0", - "policy": { - "name": "my_model", - "source": { - "class_path": "my_domain_inference.models.MyModel" - } - }, - "model": { - "n_obs_steps": 1, - "runner": { - "type": "action_chunking", - "chunk_size": 1, - "n_action_steps": 1 - }, - "artifacts": { - "model": "model.onnx" - }, - "io": { - "inputs": [ - {"name": "input", "dtype": "float32", "shape": ["B", 3, 640, 640]} - ], - "outputs": [ - {"name": "output", "dtype": "float32", "shape": ["B", 100, 6]} - ], - "preprocessors": [], - "postprocessors": [] - } - }, - "hardware": { - "robots": [], - "cameras": [] - }, - "metadata": { - "created_at": "2026-03-27T12:00:00Z", - "created_by": "my-domain-inference" - } -} +my_runner = "my_domain.runners:MyDomainRunner" ``` -**Loading from HuggingFace:** +**HuggingFace publishing:** Domain layers can publish model packages to HuggingFace containing exported artifacts + `manifest.json`. Loading is automatic: ```python -from inferencekit import InferenceModel - -# Auto-downloads model + resolves domain package model = InferenceModel("hf://username/my-model") outputs = model(inputs) ``` @@ -740,7 +603,13 @@ outputs = model(inputs) ## Runners (Domain-Provided) -inferencekit defines the `InferenceRunner` interface. Domain layers implement concrete runners. +inferencekit defines the `InferenceRunner` interface. Domain layers implement concrete runners: + +| Runner | Description | Stateful | +| --- | --- | --- | +| **SinglePassRunner** | Default. One forward pass per call. Covers 90% of use cases. | No | +| **BatchRunner** | Splits inputs into batches for throughput optimization. | No | +| **StreamingRunner** | Buffers inputs for real-time streaming applications. 
| Yes | ```python class SinglePassRunner(InferenceRunner): @@ -748,96 +617,16 @@ class SinglePassRunner(InferenceRunner): def run(self, adapter: RuntimeAdapter, inputs: dict) -> dict: return adapter.predict(inputs) - - def reset(self) -> None: - pass # No state - - -class BatchRunner(InferenceRunner): - """Batched inference for throughput optimization.""" - - def __init__(self, batch_size: int = 8): - self.batch_size = batch_size - - def run(self, adapter: RuntimeAdapter, inputs: dict) -> dict: - # Split inputs into batches, run, merge results - ... - - -class StreamingRunner(InferenceRunner): - """Streaming inference for real-time applications.""" - - def __init__(self, buffer_size: int = 1): - self.buffer_size = buffer_size - - def run(self, adapter: RuntimeAdapter, inputs: dict) -> dict: - # Process streaming inputs with buffering - ... ``` -### Contrib Runners - -If desired, inferencekit can host a small `contrib` module for reference implementations, but it does not own domain logic. - -```python -# inferencekit/contrib/iterative.py -class IterativeRunner(InferenceRunner): - """Runner for iterative/flow-matching inference. - - Performs multiple forward passes with denoising steps. - Used by diffusion models, flow-matching policies, etc. 
- """ +**Contrib runners** (`inferencekit.contrib`): Reference implementations for common patterns, shipped as optional extras: - def __init__( - self, - num_steps: int = 10, - scheduler: str = "euler", - timestep_spacing: str = "linear", - ): - self.num_steps = num_steps - self.scheduler = scheduler - self.timestep_spacing = timestep_spacing - - def run(self, adapter: RuntimeAdapter, inputs: dict) -> dict: - x_t = np.random.randn(*self._infer_shape(inputs)).astype(np.float32) - timesteps = self._generate_timesteps() - dt = -1.0 / self.num_steps +| Runner | Description | Use Case | +| --- | --- | --- | +| **IterativeRunner** | Multi-step denoising with configurable scheduler | Diffusion, flow-matching policies | +| **TiledRunner** | Tile-based inference with overlap and merging | High-resolution images, satellite imagery | - for t in timesteps: - step_inputs = {**inputs, "x_t": x_t, "timestep": np.array([t])} - v_t = adapter.predict(step_inputs)["v_t"] - x_t = self._step(x_t, v_t, dt) - - return {"output": x_t} -``` - -```python -# inferencekit/contrib/tiled.py -class TiledRunner(InferenceRunner): - """Runner for tile-based inference on large inputs. - - Splits large inputs into overlapping tiles, runs inference - on each tile, and merges results. Useful for high-resolution - images, satellite imagery, medical imaging, etc. - """ - - def __init__( - self, - tile_size: tuple[int, int] = (640, 640), - overlap: float = 0.25, - merge_strategy: str = "average", - ): - self.tile_size = tile_size - self.overlap = overlap - self.merge_strategy = merge_strategy - - def run(self, adapter: RuntimeAdapter, inputs: dict) -> dict: - tiles = self._split_into_tiles(inputs) - tile_results = [adapter.predict(tile) for tile in tiles] - return self._merge_results(tile_results) -``` - -Domain layers can also contribute runners back to `inferencekit.contrib` via pull request, or ship them in their own packages. 
+Domain layers can contribute runners back to `inferencekit.contrib` via pull request, or ship them in their own packages. --- @@ -854,103 +643,15 @@ Third-party backends can be added via the backend registry without modifying inf --- -## Domain Layer Examples - -These examples show how domain-specific libraries build on inferencekit's interfaces. Each example demonstrates the pattern; full implementations live in their respective packages. +## Domain Layer Example -### Example 1: Vision (model_api) - -[model_api](https://github.com/open-edge-platform/model_api) provides vision-specific inference on top of inferencekit. It adds image preprocessing, task-specific model wrappers, and structured result types. +This example shows how physicalai builds on inferencekit's interfaces. Policy-specific behavior (`select_action`, episode reset) is implemented in physical‑ai‑framework's `InferenceModel` wrapper: ```python -# model_api wrapping inferencekit for vision inference -from inferencekit import InferenceModel -from inferencekit.runners import InferenceRunner, SinglePassRunner -from inferencekit.preprocessors import Preprocessor -from inferencekit.postprocessors import Postprocessor - - -# Vision-specific preprocessor -class ImagePreprocessor(Preprocessor): - """Resize, normalize, and layout-transform images.""" - - def __init__(self, target_size, mean, std, layout="NCHW"): - self.target_size = target_size - self.mean = np.array(mean) - self.std = np.array(std) - self.layout = layout - - def __call__(self, inputs: dict) -> dict: - image = inputs["image"] - image = cv2.resize(image, self.target_size) - image = (image.astype(np.float32) / 255.0 - self.mean) / self.std - if self.layout == "NCHW": - image = image.transpose(2, 0, 1) - inputs["image"] = image[np.newaxis, ...] 
- return inputs - - -# Vision-specific postprocessor (e.g., NMS for detection) -class DetectionPostprocessor(Postprocessor): - """Decode detection outputs and apply NMS.""" - - def __init__(self, confidence_threshold=0.5, nms_threshold=0.45): - self.confidence_threshold = confidence_threshold - self.nms_threshold = nms_threshold - - def __call__(self, outputs: dict) -> dict: - boxes, scores, labels = self._decode(outputs) - keep = self._nms(boxes, scores) - return { - "boxes": boxes[keep], - "scores": scores[keep], - "labels": labels[keep], - } - - -# Vision model built on top of InferenceModel -class DetectionModel(InferenceModel): - """YOLO/SSD/etc. detection model.""" - - def __init__(self, path, confidence=0.5, **kwargs): - super().__init__(path, **kwargs) - self.preprocessors = [ - ImagePreprocessor( - target_size=(640, 640), - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225], - ) - ] - self.postprocessors = [ - DetectionPostprocessor(confidence_threshold=confidence) - ] - - def detect(self, image: np.ndarray) -> dict: - """Convenience method for vision users.""" - return self({"image": image}) -``` - -**Usage:** - -```python -from model_api import DetectionModel - -model = DetectionModel("./exports/yolo_v8", backend="openvino") -detections = model.detect(image) -print(detections["boxes"], detections["scores"]) -``` - -### Example 2: Physical‑AI Plugins - -physicalai hosts policy plugins for physicalai-train, LeRobot, and custom frameworks. Each plugin supplies preprocessors, runners, and optional wrappers. - -```python -# physical‑ai‑framework plugin example (policy-specific) from inferencekit import InferenceModel from inferencekit.runners import InferenceRunner -# Policy-specific runner with action chunking class ActionChunkingRunner(InferenceRunner): """Runner that manages action chunk queues. 
@@ -974,65 +675,9 @@ class ActionChunkingRunner(InferenceRunner): def reset(self): self._action_queue = [] - - ``` -Policy‑specific behavior (e.g., `select_action`, episode reset) is implemented in physical‑ai‑framework’s `InferenceModel` wrapper, which subclasses inferencekit’s base `InferenceModel`. - -### Example 3: Custom Domain - -Anyone can build a domain layer. Here's a minimal example for audio inference: - -```python -# audio_inference/model.py -from inferencekit import InferenceModel -from inferencekit.preprocessors import Preprocessor - - -class AudioPreprocessor(Preprocessor): - """Convert audio to mel spectrogram.""" - - def __init__(self, sample_rate=16000, n_mels=80): - self.sample_rate = sample_rate - self.n_mels = n_mels - - def __call__(self, inputs): - audio = inputs["audio"] - mel = librosa.feature.melspectrogram( - y=audio, sr=self.sample_rate, n_mels=self.n_mels - ) - inputs["mel_spectrogram"] = mel - return inputs - - -class AudioClassificationModel(InferenceModel): - """Audio classification on top of inferencekit.""" - - def __init__(self, path, **kwargs): - super().__init__(path, **kwargs) - self.preprocessors = [AudioPreprocessor()] - - def classify(self, audio: np.ndarray) -> dict: - return self({"audio": audio}) -``` - -**Package and publish:** - -```toml -# audio_inference/pyproject.toml -[project] -name = "audio-inference-kit" -dependencies = ["inferencekit", "librosa"] - -[project.entry-points."inferencekit.runners"] -audio_streaming = "audio_inference.runners:AudioStreamingRunner" -``` - -```bash -pip install audio-inference-kit -# or publish to HuggingFace with model artifacts + metadata -``` +Other domain layers (model_api for vision, custom audio/NLP packages) follow the same pattern: subclass `InferenceModel`, implement domain runners and pre/postprocessors, register via entry points. 
--- @@ -1085,136 +730,6 @@ with InferenceModel("./exports/my_model") as model: --- -## API Reference - -### Main Entry Point - -```python -from inferencekit import InferenceModel - -model = InferenceModel("./exports/my_model") -outputs = model(inputs) -``` - -### Runners - -```python -from inferencekit.runners import ( - InferenceRunner, # ABC - subclass for custom runners - SinglePassRunner, # Default - covers 90% of models - BatchRunner, # Throughput-optimized batching - StreamingRunner, # Real-time streaming -) - -# Contrib runners (install with inferencekit[contrib]) -from inferencekit.contrib import ( - IterativeRunner, # Multi-step denoising / flow matching - TiledRunner, # Tile-based for large inputs -) -``` - -### Adapters - -```python -from inferencekit.adapters import ( - RuntimeAdapter, # ABC - OpenVINOAdapter, # Intel devices - ONNXAdapter, # Cross-platform - TorchExportAdapter, # PyTorch - get_adapter, # Factory function -) -``` - -### Callbacks - -```python -from inferencekit.callbacks import ( - Callback, # ABC - TimingCallback, # Performance profiling - LoggingCallback, # Prediction logging -) -``` - -### Plugins - -```python -from inferencekit.plugins import registry - -# List available backends -print(registry.backends.list()) - -# Register custom backend -registry.backends.register("my_backend", MyBackend) - -# Get a backend by name -adapter = registry.backends.get("onnx", device="cuda") -``` - -### Extension Points - -| Extension | How to Extend | Registration | -| ---------------- | --------------------------- | ------------------------------------- | -| New backend | Implement `RuntimeAdapter` | Entry point: `inferencekit.backends` | -| New runner | Implement `InferenceRunner` | Entry point: `inferencekit.runners` | -| New model format | Implement format plugin | Entry point: `inferencekit.formats` | -| New callback | Subclass `Callback` | Entry point: `inferencekit.callbacks` | -| Preprocessing | Implement `Preprocessor` | Via metadata 
`class_path` | -| Postprocessing | Implement `Postprocessor` | Via metadata `class_path` | - ---- - -## Appendix: Design Rationale - -### Why a separate inference package? - -1. **Reusability**: Same core across vision (model_api), robotics (physicalai-train), audio, NLP, and custom domains -2. **Clear boundaries**: Generic concerns (backends, metadata, plugins) separated from domain concerns (images, robots, audio) -3. **Easier testing**: Domain-agnostic package has fewer dependencies -4. **Ecosystem growth**: Anyone can build and publish domain layers without modifying inferencekit - -### Why inferencekit is a base layer, not a model_api replacement - -model_api provides rich vision-specific functionality: image preprocessing embedded in model graphs, task-specific wrappers (YOLO, SSD, SAM), result types, parameter validation, and tiling. These are vision concerns that don't belong in a generic inference framework. - -Instead of replacing model_api, inferencekit provides the **foundation** that model_api can build on: - -| Concern | inferencekit provides | model_api adds | -| ----------------- | -------------------------------------- | ---------------------------------------- | -| Backend execution | RuntimeAdapter (OV, ONNX, TRT) | Wraps RuntimeAdapter in InferenceAdapter | -| Model loading | Manifest-driven `InferenceModel(path)` | Vision-specific `Model.create_model()` | -| Preprocessing | Preprocessor ABC | ImageResize, Normalize, LayoutTransform | -| Postprocessing | Postprocessor ABC | NMS, BoxDecoder, MaskDecoder | -| Runners | SinglePassRunner, BatchRunner | TiledRunner (via contrib or own impl) | -| Result types | `dict[str, Any]` | DetectionResult, ClassificationResult | - -### Migration path for model_api - -1. **Phase 1 (compatibility)**: model_api wraps inferencekit's RuntimeAdapter inside its existing InferenceAdapter. No public API change. -2. **Phase 2 (adoption)**: model_api adopts RuntimeAdapter directly, deprecates its own adapter layer. -3. 
**Phase 3 (simplification)**: model_api becomes a pure domain layer on top of inferencekit. - -### Why runners are separate from adapters? - -- **Adapters** handle backend-specific execution (ONNX vs OpenVINO) -- **Runners** handle algorithm-specific patterns (single-pass vs iterative) -- This separation allows N backends × M inference patterns without N×M implementations - -### Why callbacks instead of inheritance? - -- **Composability**: Mix and match (timing + logging + safety) -- **Reusability**: Same callback works across all models and domains -- **Maintainability**: Add cross-cutting concerns without changing core code -- **Familiarity**: Lightning users already understand this pattern - -### Why a plugin system? - -- **Ecosystem growth**: Third parties can extend without forking -- **Clean dependencies**: inferencekit doesn't depend on domain packages -- **Discoverability**: Entry points make extensions automatically available -- **Publishability**: Domain layers can be packaged and shared independently - ---- - ## Related Documents - **[Strategy](../architecture/strategy.md)** — Big-picture architecture and layering decisions @@ -1223,5 +738,5 @@ Instead of replacing model_api, inferencekit provides the **foundation** that mo --- -_Document Version: 5.1_ +_Document Version: 6.0_ _Last Updated: 2026-03-31_ diff --git a/docs/design/integrations/lerobot.md b/docs/design/integrations/lerobot.md index f368e5c..36a3f56 100644 --- a/docs/design/integrations/lerobot.md +++ b/docs/design/integrations/lerobot.md @@ -13,8 +13,8 @@ This document describes how **PhysicalAI** integrates with **LeRobot** exported **Key principles:** -1. **One schema, two expressiveness levels** --- The manifest schema supports two component formats: `type` + flat params (interoperable, used by LeRobot) and `class_path` + `init_args` (full-power, used by PhysicalAI). PhysicalAI reads both; LeRobot reads `type` only. -2. 
**LeRobot is standalone** --- LeRobot's export system works perfectly without PhysicalAI installed. No PhysicalAI imports, no PhysicalAI class paths in manifests. +1. **One schema, two expressiveness levels** --- The manifest supports two component formats: `type` + flat params (interoperable, used by LeRobot) and `class_path` + `init_args` (full-power, used by PhysicalAI). PhysicalAI reads both; LeRobot reads `type` only. +2. **LeRobot is standalone** --- LeRobot's export system works without PhysicalAI installed. No PhysicalAI imports, no PhysicalAI class paths in manifests. 3. **PhysicalAI loads LeRobot exports natively** --- `InferenceModel.load("./lerobot_export")` works out of the box. No adapter class, no special-casing. 4. **Dependency is strictly one-way** --- LeRobot does not depend on PhysicalAI. PhysicalAI reads LeRobot's output (pure JSON) without importing LeRobot. @@ -25,83 +25,35 @@ policy.export("./out") --produces--> InferenceModel.load("./out") | Same manifest.json schema +-- reads manifest.json Writes: type + flat params +-- resolves via type OR class_path - Own runners (numpy-only) +-- builds preprocessors/postprocessors from io + Own runners (numpy-only) +-- builds preprocessors/postprocessors Zero physicalai deps +-- runs inference through pipeline ``` -### Dual Component Resolution - -The manifest supports two ways to specify components (runners, preprocessors, postprocessors): - -| Format | Who writes | Who reads | Example | -| ------------------------------ | ---------------------------------- | -------------------- | --------------------------------------------------------------------- | -| **`type` + flat params** | LeRobot, simple PhysicalAI exports | Both (interoperable) | `{"type": "action_chunking", "chunk_size": 100}` | -| **`class_path` + `init_args`** | PhysicalAI (full-power) | PhysicalAI only | `{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100}}` | - -PhysicalAI resolves both 
through the same `ComponentRegistry` + `instantiate_component()` pipeline. LeRobot only reads `type` and maps to its own implementations. See [Runner Resolution](#runner-resolution) for the resolution algorithm. - --- ## Table of Contents -- [PhysicalAI: LeRobot Integration Design](#physicalai-lerobot-integration-design) - - [Executive Summary](#executive-summary) - - [Dual Component Resolution](#dual-component-resolution) - - [Table of Contents](#table-of-contents) - - [1. Architecture Overview](#1-architecture-overview) - - [2. Converged Manifest Format](#2-converged-manifest-format) - - [Schema Overview](#schema-overview) - - [Full Example: ACT Policy](#full-example-act-policy) - - [Runner Variants](#runner-variants) - - [Field Reference](#field-reference) - - [Top-Level Envelope](#top-level-envelope) - - [`policy` --- Identity](#policy-----identity) - - [`model` --- How to Run](#model-----how-to-run) - - [`hardware` --- Deployment](#hardware-----deployment) - - [`metadata` --- Provenance](#metadata-----provenance) - - [Preprocessor / Postprocessor Entry](#preprocessor--postprocessor-entry) - - [Design Decisions](#design-decisions) - - [3. How PhysicalAI Loads the Manifest](#3-how-physicalai-loads-the-manifest) - - [Loading Flow](#loading-flow) - - [Runner Resolution](#runner-resolution) - - [Preprocessor and Postprocessor Construction](#preprocessor-and-postprocessor-construction) - - [4. How LeRobot Uses the Manifest](#4-how-lerobot-uses-the-manifest) - - [5. Runner Mapping](#5-runner-mapping) - - [`model.runner.type` to Runner](#modelrunnertype-to-runner) - - [Runner Parameters (All in `model.runner`)](#runner-parameters-all-in-modelrunner) - - [6. 
Normalization Handling](#6-normalization-handling) - - [Problem](#problem) - - [Solution: Preprocessor and Postprocessor Entries](#solution-preprocessor-and-postprocessor-entries) - - [PhysicalAI Implementation](#physicalai-implementation) - - [Normalization Modes](#normalization-modes) - - [Stats File Format](#stats-file-format) - - [7. Usage Examples](#7-usage-examples) - - [Basic Usage](#basic-usage) - - [With Callbacks](#with-callbacks) - - [Override Runner Parameters](#override-runner-parameters) - - [Real-Time Control](#real-time-control) - - [8. Supported Policies](#8-supported-policies) - - [9. Testing Strategy](#9-testing-strategy) - - [Conformance Tests](#conformance-tests) - - [Parity Tests](#parity-tests) - - [Backward Compatibility Tests](#backward-compatibility-tests) - - [10. Migration from Legacy Formats](#10-migration-from-legacy-formats) - - [Migration Path](#migration-path) - - [Schema Enforcement](#schema-enforcement) - - [Appendix A: Design Rationale](#appendix-a-design-rationale) - - [Why One Format Instead of Two?](#why-one-format-instead-of-two) - - [Why `model` as a Container?](#why-model-as-a-container) - - [Why Not `model: null` for Single-Pass?](#why-not-model-null-for-single-pass) - - [Why Preprocessors Inside `io`?](#why-preprocessors-inside-io) - - [Why `policy.source.class_path`?](#why-policysourceclass_path) - - [Appendix B: Comparison with Previous Design](#appendix-b-comparison-with-previous-design) - - [Related Documents](#related-documents) +- [Executive Summary](#executive-summary) +- [1. Architecture Overview](#1-architecture-overview) +- [2. Converged Manifest Format](#2-converged-manifest-format) + - [Schema Overview](#schema-overview) + - [Full Example: ACT Policy](#full-example-act-policy) + - [Runner Variants](#runner-variants) + - [Field Reference](#field-reference) + - [Dual Component Resolution](#dual-component-resolution) +- [3. How PhysicalAI Loads the Manifest](#3-how-physicalai-loads-the-manifest) +- [4. 
How LeRobot Uses the Manifest](#4-how-lerobot-uses-the-manifest) +- [5. Runner Mapping](#5-runner-mapping) +- [6. Normalization Handling](#6-normalization-handling) +- [7. Usage Examples](#7-usage-examples) +- [8. Supported Policies](#8-supported-policies) +- [Related Documents](#related-documents) --- ## 1. Architecture Overview -The integration is seamless because both frameworks share the same manifest schema. PhysicalAI's `InferenceModel` reads the manifest, resolves components (runner, preprocessors, postprocessors, adapter), and runs inference --- regardless of which framework produced the export. +Both frameworks share the same manifest schema. PhysicalAI's `InferenceModel` reads the manifest, resolves components (runner, preprocessors, postprocessors, adapter), and runs inference --- regardless of which framework produced the export. ```text +-----------------------------------------------------------------------+ @@ -142,9 +94,7 @@ The integration is seamless because both frameworks share the same manifest sche | Feature | LeRobot Standalone | PhysicalAI | | ----------------------------------- | ------------------ | ---------------------- | | Load exported policy | Yes | Yes | -| Single-pass inference | Yes | Yes | -| Iterative inference | Yes | Yes | -| Two-phase inference | Yes | Yes | +| Single-pass / iterative / two-phase | Yes | Yes | | Action chunking | Yes | Yes | | Callbacks (timing, logging, safety) | No | Yes | | Multi-backend with fallback | ONNX + OpenVINO | ONNX + OpenVINO + TRT | @@ -158,7 +108,7 @@ The integration is seamless because both frameworks share the same manifest sche ### Schema Overview -The manifest mirrors PhysicalAI's `InferenceModel` class hierarchy, following the same philosophy as training configs (which split into `model`, `data`, `trainer` sections): +The manifest mirrors PhysicalAI's `InferenceModel` class hierarchy: ```text manifest.json @@ -170,7 +120,8 @@ manifest.json | +-- n_obs_steps (observation window size) | +-- 
runner (execution pattern + parameters) | +-- artifacts (model files by named role) -| +-- io (I/O contract: shapes, preprocessors, postprocessors) +| +-- preprocessors (input transforms: normalize, etc.) +| +-- postprocessors (output transforms: denormalize, etc.) +-- hardware (deployment --- what hardware?) | +-- robots (robot configurations) | +-- cameras (camera configurations) @@ -200,31 +151,22 @@ manifest.json "artifacts": { "model": "model.onnx" }, - "io": { - "inputs": [ - {"name": "observation.image", "dtype": "float32", "shape": ["B", 3, 96, 96]}, - {"name": "observation.state", "dtype": "float32", "shape": ["B", 14]} - ], - "outputs": [ - {"name": "action", "dtype": "float32", "shape": ["B", 100, 14]} - ], - "preprocessors": [ - { - "type": "normalize", - "mode": "mean_std", - "artifact": "stats.safetensors", - "features": ["observation.state"] - } - ], - "postprocessors": [ - { - "type": "denormalize", - "mode": "mean_std", - "artifact": "stats.safetensors", - "features": ["action"] - } - ] - } + "preprocessors": [ + { + "type": "normalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["observation.state"] + } + ], + "postprocessors": [ + { + "type": "denormalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["action"] + } + ] }, "hardware": { "robots": [], @@ -241,7 +183,7 @@ manifest.json ### Runner Variants -The `model.runner` section is open-ended --- policy-specific parameters go directly in the runner object alongside `type`. This avoids the need for a rigid union schema. Each runner implementation declares its expected parameters and logs a warning for any unrecognized keys (see [Runner Parameter Validation](#runner-parameter-validation)). +The `model.runner` section is open-ended --- policy-specific parameters go directly in the runner object alongside `type`. 
**ACT / VQBeT** (single-pass with action chunking): @@ -267,8 +209,6 @@ The `model.runner` section is open-ended --- policy-specific parameters go direc **PI0** (two-phase: encode once + denoise iteratively): -For two-phase policies, all model artifacts are listed in `model.artifacts` with named roles (`encoder`, `denoise`) rather than backend names. The runner references these roles: - ```json "artifacts": { "encoder": "encoder.onnx", @@ -283,33 +223,6 @@ For two-phase policies, all model artifacts are listed in `model.artifacts` with } ``` -**SmolVLA** (two-phase, no explicit scheduler): - -```json -"artifacts": { - "encoder": "encoder.onnx", - "denoise": "denoise.onnx" -}, -"runner": { - "type": "two_phase", - "chunk_size": 50, - "n_action_steps": 50, - "num_inference_steps": 10 -} -``` - -**TDMPC** (iterative with model-predictive control): - -```json -"runner": { - "type": "iterative", - "horizon": 5, - "n_action_steps": 1, - "use_mpc": true, - "cem_iterations": 6 -} -``` - ### Field Reference #### Top-Level Envelope @@ -330,17 +243,14 @@ For two-phase policies, all model artifacts are listed in `model.artifacts` with #### `model` --- How to Run -| Field | Type | Required | Description | -| ----------------------------- | ------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `model.n_obs_steps` | int | Yes | Number of observation timesteps needed by the model (see [Design Decisions](#design-decisions)) | -| `model.runner` | object | Yes | Runner configuration (see variants) | -| `model.runner.type` | string | Yes | Runner type: `action_chunking`, `iterative`, `two_phase` | -| `model.artifacts` | object | Yes | Map of artifact role to filename. Single-model: `{"model": "model.onnx"}`. 
Two-phase: `{"encoder": "encoder.onnx", "denoise": "denoise.onnx"}` | -| `model.io` | object | Yes | I/O specification | -| `model.io.inputs` | array | Yes | Input tensor specifications | -| `model.io.outputs` | array | Yes | Output tensor specifications | -| `model.io.preprocessors` | array | No | Input transforms (normalize, etc.) | -| `model.io.postprocessors` | array | No | Output transforms (denormalize, etc.) | +| Field | Type | Required | Description | +| -------------------------- | ------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `model.n_obs_steps` | int | Yes | Number of observation timesteps needed by the model | +| `model.runner` | object | Yes | Runner configuration (see [Runner Variants](#runner-variants)) | +| `model.runner.type` | string | Yes | Runner type: `action_chunking`, `iterative`, `two_phase` | +| `model.artifacts` | object | Yes | Map of artifact role to filename. Single-model: `{"model": "model.onnx"}`. Two-phase: `{"encoder": "encoder.onnx", "denoise": "denoise.onnx"}` | +| `model.preprocessors` | array | No | Input transforms (normalize, etc.) | +| `model.postprocessors` | array | No | Output transforms (denormalize, etc.) | #### `hardware` --- Deployment @@ -361,228 +271,104 @@ For two-phase policies, all model artifacts are listed in `model.artifacts` with | Field | Type | Required | Description | | ------------ | ------ | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | | `type` | string | Yes | Processor type: `"normalize"`, `"denormalize"`, or custom | -| `class_path` | string | No | Python class path for custom processor types. Built-in types (`normalize`, `denormalize`) resolve by convention; unknown types require `class_path`. 
| -| `mode` | string | No | Normalization mode: `"mean_std"`, `"min_max"`, `"identity"` (required for `normalize`/`denormalize`) | -| `artifact` | string | No | Path to stats file (e.g., `"stats.safetensors"`) (required for `normalize`/`denormalize`) | -| `features` | array | No | Feature names to process (e.g., `["observation.state"]`) (required for `normalize`/`denormalize`) | +| `class_path` | string | No | Full Python class path (required for custom types; built-in types resolve by convention) | +| `mode` | string | No | Normalization mode: `"mean_std"`, `"min_max"`, `"identity"` | +| `artifact` | string | No | Path to stats file (e.g., `"stats.safetensors"`) | +| `features` | array | No | Feature names to process (e.g., `["observation.state"]`) | + +### Dual Component Resolution -Built-in types resolve by convention: `"normalize"` maps to `StatsNormalizer`, `"denormalize"` maps to `StatsDenormalizer`. For custom processor types, provide a `class_path`: +The manifest supports two ways to specify components (runners, preprocessors, postprocessors): + +| Format | Who writes | Who reads | Example | +| ------------------------------ | ---------------------------------- | -------------------- | --------------------------------------------------------------------- | +| **`type` + flat params** | LeRobot, simple PhysicalAI exports | Both (interoperable) | `{"type": "action_chunking", "chunk_size": 100}` | +| **`class_path` + `init_args`** | PhysicalAI (full-power) | PhysicalAI only | `{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100}}` | + +Both formats resolve through the same `ComponentRegistry` + `instantiate_component()` pipeline: + +- **`class_path`** (full Python path) → direct import → instantiate +- **`type`** (short name) → registry lookup → resolve to full path → instantiate ```json -{ - "type": "clamp", - "class_path": "physicalai.inference.postprocessors.ActionClamp", - "min": -1.0, - "max": 1.0 -} -``` 
+// LeRobot writes (type + flat params): +{"type": "action_chunking", "chunk_size": 100, "n_action_steps": 100} -### Design Decisions - -| Decision | Rationale | -| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| **No `policy.type` field** | `model.runner.type` is the sole source of truth for runner construction. A separate `policy.type` would overlap without matching (e.g., ACT is `single_pass` but uses `action_chunking` runner), causing confusion. Eliminated to avoid ambiguity. | -| **`hardware` is top-level** | Deployment needs hardware information even if LeRobot doesn't use it yet. Cheap future-proofing. | -| **Preprocessors inside `model.io`** | They are I/O transforms, logically part of the I/O boundary. Not separate top-level sections. | -| **`format` + `version` kept** | Cheap future-proofing for schema evolution. `format` enables detection, `version` enables migration. | -| **`model` is a container** | Mirrors `InferenceModel` class hierarchy. Matches training config philosophy (`model`/`data`/`trainer`). | -| **No separate `action` section** | `chunk_size` and `n_action_steps` are runner behavioral params. `action_dim` is redundant with output shape. | -| **`mode` per processor** | Different features may use different normalization modes (mean_std vs min_max). | -| **Runner params are open with validation** | Policy-specific fields go directly in runner. Each runner declares expected params and warns on unknown keys (see [Runner Parameter Validation](#runner-parameter-validation)). | -| **Named artifact roles** | `model.artifacts` uses role names (`model`, `encoder`, `denoise`) not backend names. 
This eliminates the split between `model.artifacts` and runner-level artifact refs for two-phase policies, giving a single authoritative location for all model files. | -| **Extensible processor types** | Built-in types (`normalize`, `denormalize`) resolve by convention. Unknown types fall back to `class_path`, allowing custom processors without code changes to the loader. | -| **`n_obs_steps` in `model`** | Although `n_obs_steps` describes what the model expects (a contract), it is consumed during inference preparation --- the caller uses it to decide how many observation frames to collect before calling the model. It sits alongside other inference-time parameters rather than in `policy` (which is purely identity/provenance). | -| **Shared JSON Schema for CI** | Both projects validate exported manifests against a shared `manifest.schema.json` to prevent schema drift (see [Schema Enforcement](#schema-enforcement)). | -| **Dual component resolution** | Components support both `type` + flat params (interoperable, LeRobot-compatible) and `class_path` + `init_args` (PhysicalAI full-power). Both resolve through the same `ComponentRegistry` + `instantiate_component()` pipeline. This avoids per-type if-chains while preserving the existing component system. See [Dual Component Resolution](#dual-component-resolution). | -| **No `_normalize_metadata()`** | The nested manifest structure is parsed directly into Pydantic models. No flattening shim needed --- both the schema and the loader are designed together. Legacy `metadata.yaml` files (pre-manifest era) are handled by `from_legacy_metadata()` only. | +// PhysicalAI writes (class_path + init_args): +{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100, "n_action_steps": 100}} + +// Both resolve to the same ActionChunkingRunner(chunk_size=100, n_action_steps=100) +``` --- ## 3. 
How PhysicalAI Loads the Manifest -### Loading Flow - -The manifest is parsed directly into nested Pydantic models --- no intermediate flattening or normalization step. The nested JSON maps 1:1 to the Pydantic model hierarchy: +The manifest is parsed directly into nested Pydantic models --- no intermediate flattening step: ```python # In InferenceModel.load(): raw = json.loads((path / "manifest.json").read_text()) - -# Validate format -if raw.get("format") != "policy_package": - msg = f"Unknown manifest format: {raw.get('format')}" - raise ValueError(msg) - -# Parse directly into nested Pydantic models manifest = Manifest.model_validate(raw) # Resolve components from typed manifest fields runner = resolve_runner(manifest.model.runner) adapter = create_adapter(manifest.model.artifacts, path) -preprocessors = resolve_processors(manifest.model.io.preprocessors, path) -postprocessors = resolve_processors(manifest.model.io.postprocessors, path) +preprocessors = resolve_processors(manifest.model.preprocessors, path) +postprocessors = resolve_processors(manifest.model.postprocessors, path) ``` -> **Legacy `metadata.yaml` files** (pre-manifest era, before `manifest.json` existed) are handled separately by `from_legacy_metadata()` in `manifest.py`. This is unrelated to the manifest format --- it handles the old YAML-based metadata from early PhysicalAI exports. - -### Runner Resolution - -The runner factory uses **dual-path resolution** --- a single if-check, not an if-chain per type. If `class_path` is present, it goes straight to `ComponentSpec` instantiation. Otherwise, `type` is resolved through the same `ComponentRegistry` pipeline: +Runner and processor resolution both use **dual-path resolution** --- a single if-check, not an if-chain per type: ```python def resolve_runner(runner_config: dict) -> InferenceRunner: - """Resolve runner from manifest config using dual-path resolution. 
- - Path 1: class_path + init_args → ComponentSpec → instantiate_component() - Path 2: type + flat params → registry lookup → ComponentSpec → instantiate_component() - - Both paths end at the same instantiate_component() call. - """ if "class_path" in runner_config: - # PhysicalAI-native: full ComponentSpec path + # PhysicalAI-native: class_path + init_args → ComponentSpec → instantiate spec = ComponentSpec.model_validate(runner_config) return instantiate_component(spec) - # Framework-agnostic: type → registry resolves short name to class_path + # Framework-agnostic: type → registry lookup → instantiate runner_type = runner_config["type"] init_args = {k: v for k, v in runner_config.items() if k != "type"} spec = ComponentSpec(class_path=runner_type, init_args=init_args) return instantiate_component(spec) ``` -**How `instantiate_component()` handles the two paths:** - -- **`class_path`** (full Python path, e.g., `"physicalai.inference.runners.ActionChunkingRunner"`) → direct import -- **`type`** (short name, e.g., `"action_chunking"`) → `ComponentRegistry.resolve()` maps to full path → import - -`class_path` always uses the full Python class path for explicit, unambiguous resolution. `type` uses the registry as the single source of truth for mapping short names to classes. - -**Example: How the same runner loads from both formats:** - -```json -// LeRobot writes (type + flat params): -{"type": "action_chunking", "chunk_size": 100, "n_action_steps": 100} - -// PhysicalAI writes (class_path + init_args): -{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100, "n_action_steps": 100}} - -// Both resolve to the same ActionChunkingRunner: -// type path: "action_chunking" → ComponentRegistry.resolve() → ActionChunkingRunner(...) -// class_path path: "physicalai.inference.runners.ActionChunkingRunner" → direct import → ActionChunkingRunner(...) 
-``` - -### Runner Parameter Validation - -Each runner declares the parameters it consumes. Unknown keys trigger a warning, catching typos without breaking forward compatibility: - -```python -class IterativeRunner(InferenceRunner): - EXPECTED_PARAMS = {"type", "num_inference_steps", "scheduler", "horizon", "n_action_steps"} - - @classmethod - def from_config(cls, config: dict) -> "IterativeRunner": - unknown = set(config.keys()) - cls.EXPECTED_PARAMS - if unknown: - logger.warning("IterativeRunner: ignoring unknown params: %s", unknown) - return cls( - num_steps=config.get("num_inference_steps", 10), - scheduler=config.get("scheduler", "euler"), - ) -``` - -### Preprocessor and Postprocessor Construction - -Processors use the same dual-path resolution as runners. The `resolve_processor()` function handles both `class_path` + `init_args` (PhysicalAI-native) and `type` + flat params (interoperable): - -```python -def resolve_processors(specs: list[dict], path: Path) -> list: - """Build processor chain from manifest specs using dual-path resolution. 
- - Each spec is resolved identically to runners: - - class_path present → ComponentSpec → instantiate_component() - - type present → registry lookup → ComponentSpec → instantiate_component() - """ - processors = [] - for spec in specs: - if "class_path" in spec: - # PhysicalAI-native: full ComponentSpec path - component_spec = ComponentSpec.model_validate(spec) - processors.append(instantiate_component(component_spec)) - else: - # Framework-agnostic: type → registry → ComponentSpec - processor_type = spec["type"] - init_args = {k: v for k, v in spec.items() if k != "type"} - # Resolve relative artifact paths to absolute - if "artifact" in init_args: - init_args["stats_path"] = path / init_args.pop("artifact") - component_spec = ComponentSpec(class_path=processor_type, init_args=init_args) - processors.append(instantiate_component(component_spec)) - return processors -``` - -**Example: Normalize processor from both formats:** - -```json -// LeRobot writes (type + flat params): -{"type": "normalize", "mode": "mean_std", "artifact": "stats.safetensors", "features": ["observation.state"]} - -// PhysicalAI writes (class_path + init_args): -{"class_path": "physicalai.inference.preprocessors.StatsNormalizer", "init_args": {"mode": "mean_std", "stats_path": "stats.safetensors", "features": ["observation.state"]}} - -// Both resolve to: StatsNormalizer(mode="mean_std", stats_path=..., features=["observation.state"]) -``` +Processors follow the same pattern, with one addition: the `artifact` key in `type`-format specs is resolved to an absolute `stats_path` at load time. -> **Note:** The `artifact` → `stats_path` key rename happens during resolution for `type`-format specs. In `class_path` format, the key is already `stats_path` (matching the constructor parameter name). +> **Legacy `metadata.yaml` files** (pre-manifest era) are handled separately by `from_legacy_metadata()` in `manifest.py`. --- ## 4. 
How LeRobot Uses the Manifest -LeRobot reads the same `manifest.json` with its own tooling. It does NOT use pydantic --- it uses `draccus` dataclasses or plain `json.load()`. +LeRobot reads the same `manifest.json` with its own tooling (no PhysicalAI dependency): ```python -# LeRobot's own loading (no physicalai dependency) import json from pathlib import Path def load_exported_policy(path: str | Path) -> ExportedPolicy: - """Load an exported policy package.""" path = Path(path) raw = json.loads((path / "manifest.json").read_text()) - # Read runner config + # Build LeRobot's own runner (standalone, numpy-only) runner_config = raw["model"]["runner"] - runner_type = runner_config["type"] + runner = build_runner(runner_config) - # Build LeRobot's own runner (standalone, numpy-only) - if runner_type == "action_chunking": - runner = ActionChunkingWrapper( - SinglePassRunner(), - chunk_size=runner_config["chunk_size"], - n_action_steps=runner_config["n_action_steps"], - ) - elif runner_type == "iterative": - runner = IterativeRunner( - num_steps=runner_config["num_inference_steps"], - scheduler=runner_config.get("scheduler", "euler"), - ) - elif runner_type == "two_phase": - runner = TwoPhaseRunner(...) - ... - - # Load normalizer from io specs - preprocessors = raw["model"]["io"].get("preprocessors", []) - postprocessors = raw["model"]["io"].get("postprocessors", []) + # Load normalizer from manifest specs + preprocessors = raw["model"].get("preprocessors", []) + postprocessors = raw["model"].get("postprocessors", []) normalizer = Normalizer.from_specs(preprocessors + postprocessors, path) - # Load backend adapter (from named artifact role) + # Load backend adapter artifacts = raw["model"]["artifacts"] adapter = ONNXRuntimeAdapter(path / artifacts["model"]) return ExportedPolicy(runner=runner, adapter=adapter, normalizer=normalizer) ``` -**Key point:** LeRobot's runners, normalizer, and adapters are its own implementations. 
They have zero overlap with PhysicalAI's implementations. The only shared artifact is the `manifest.json` file on disk. +LeRobot's runners, normalizer, and adapters are its own implementations with zero overlap with PhysicalAI's. The only shared artifact is `manifest.json` on disk. --- @@ -605,88 +391,33 @@ def load_exported_policy(path: str | Path) -> ExportedPolicy: | `num_inference_steps` | iterative, two_phase | Number of denoising steps | | `scheduler` | iterative, two_phase | Scheduler algorithm (euler, ddpm, ddim) | | `horizon` | iterative | Planning horizon (Diffusion, TDMPC) | -| `use_mpc` | iterative | Enable model-predictive control (TDMPC) | -| `cem_iterations` | iterative | CEM optimization iterations (TDMPC) | - -> **Note:** Two-phase artifact paths (`encoder`, `denoise`) live in `model.artifacts`, not in the runner config. See [Runner Variants](#runner-variants) for examples. --- ## 6. Normalization Handling -### Problem - -LeRobot policies operate on **normalized** inputs and produce **normalized** outputs. Normalization statistics are saved alongside the model in `stats.safetensors`. At inference time: - -1. **Observations must be normalized** before feeding to the model -2. **Actions must be denormalized** after the model produces them - -### Solution: Preprocessor and Postprocessor Entries - -The manifest declares normalization as I/O transforms in `model.io.preprocessors` and `model.io.postprocessors`: +LeRobot policies operate on **normalized** inputs and produce **normalized** outputs. 
The manifest declares normalization as transforms in `model.preprocessors` and `model.postprocessors`: ```json -"io": { - "inputs": [...], - "outputs": [...], - "preprocessors": [ - { - "type": "normalize", - "mode": "mean_std", - "artifact": "stats.safetensors", - "features": ["observation.state"] - } - ], - "postprocessors": [ - { - "type": "denormalize", - "mode": "mean_std", - "artifact": "stats.safetensors", - "features": ["action"] - } - ] -} -``` - -### PhysicalAI Implementation - -Two pipeline components handle normalization: - -**`StatsNormalizer`** (preprocessor): - -```python -class StatsNormalizer(Preprocessor): - """Normalize input features using saved statistics.""" - - def __init__(self, stats_path: Path, features: list[str], mode: str = "mean_std"): - self.stats = load_stats(stats_path) - self.features = features - self.mode = mode - - def __call__(self, inputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]: - for feature in self.features: - if feature in inputs: - inputs[feature] = self._normalize(inputs[feature], feature) - return inputs +"preprocessors": [ + { + "type": "normalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["observation.state"] + } +], +"postprocessors": [ + { + "type": "denormalize", + "mode": "mean_std", + "artifact": "stats.safetensors", + "features": ["action"] + } +] ``` -**`StatsDenormalizer`** (postprocessor): - -```python -class StatsDenormalizer(Postprocessor): - """Denormalize output features using saved statistics.""" - - def __init__(self, stats_path: Path, features: list[str], mode: str = "mean_std"): - self.stats = load_stats(stats_path) - self.features = features - self.mode = mode - - def __call__(self, outputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]: - for feature in self.features: - if feature in outputs: - outputs[feature] = self._denormalize(outputs[feature], feature) - return outputs -``` +PhysicalAI resolves these to `StatsNormalizer` (preprocessor) and 
`StatsDenormalizer` (postprocessor), which load stats from `stats.safetensors` and apply per-feature transforms. ### Normalization Modes @@ -696,9 +427,7 @@ class StatsDenormalizer(Postprocessor): | `min_max` | `(x - min) / (max - min) * 2 - 1` | `(x + 1) / 2 * (max - min) + min` | | `identity` | passthrough | passthrough | -### Stats File Format - -Normalization statistics are stored in `safetensors` format. Each feature has `{feature}/mean`, `{feature}/std`, `{feature}/min`, `{feature}/max` tensors as needed by the normalization mode. +Statistics are stored in `safetensors` format with `{feature}/mean`, `{feature}/std`, `{feature}/min`, `{feature}/max` tensors, as needed by the normalization mode. --- @@ -712,7 +441,6 @@ from physicalai import InferenceModel # Load LeRobot-exported policy (detected automatically via manifest.json) model = InferenceModel("./act_exported") -# Run inference observation = { "observation.image": image_array, # float32, shape (1, 3, 96, 96) "observation.state": state_array, # float32, shape (1, 14) @@ -727,19 +455,13 @@ action = outputs["action"] # float32, shape (1, 14) from physicalai import InferenceModel from physicalai.inference.callbacks import TimingCallback -model = InferenceModel( - "./pi0_exported", - callbacks=[TimingCallback()], -) - +model = InferenceModel("./pi0_exported", callbacks=[TimingCallback()]) outputs = model(observation) -# TimingCallback logs: "Inference: 12.3ms" ``` ### Override Runner Parameters ```python -# Override denoising steps at load time (no re-export needed) model = InferenceModel( "./diffusion_exported", num_steps=20, # Override manifest default of 100 @@ -750,8 +472,6 @@ model = InferenceModel( ### Real-Time Control ```python -from physicalai import InferenceModel - policy = InferenceModel("./act_exported") policy.reset() @@ -759,7 +479,6 @@ while not done: action = policy.select_action(observation) observation, reward, done, info = env.step(action) -# Reset between episodes policy.reset() ``` @@ -767,8 +486,6 @@ policy.reset() ## 8. 
Supported Policies -All LeRobot policy types are supported through the converged runner system: - | Policy | `runner.type` | Runner Stack | Artifact Roles | | --------- | --------------- | -------------------------------------------- | -------------------- | | ACT | action_chunking | ActionChunking(SinglePass) | `model` | @@ -780,188 +497,6 @@ All LeRobot policy types are supported through the converged runner system: --- -## 9. Testing Strategy - -### Conformance Tests - -Verify that PhysicalAI correctly loads manifests produced by LeRobot: - -```python -class TestConvergedManifestLoading: - """Verify PhysicalAI loads converged manifest format.""" - - def test_detect_policy_package(self, package_path): - """Detect exported package via format field.""" - manifest = json.loads((package_path / "manifest.json").read_text()) - assert manifest["format"] == "policy_package" - - def test_load_action_chunking(self, act_package): - """Load ACT policy with action chunking runner.""" - model = InferenceModel(act_package) - assert isinstance(model.runner, ActionChunkingRunner) - - def test_load_iterative(self, diffusion_package): - """Load Diffusion policy with iterative runner.""" - model = InferenceModel(diffusion_package) - assert isinstance(model.runner, IterativeRunner) - - def test_load_two_phase(self, pi0_package): - """Load PI0 policy with two-phase runner.""" - model = InferenceModel(pi0_package) - assert isinstance(model.runner, TwoPhaseRunner) - - def test_preprocessors_created(self, act_package): - """Preprocessors auto-created from io.preprocessors.""" - model = InferenceModel(act_package) - assert len(model.preprocessors) > 0 - assert isinstance(model.preprocessors[0], StatsNormalizer) - - def test_postprocessors_created(self, act_package): - """Postprocessors auto-created from io.postprocessors.""" - model = InferenceModel(act_package) - assert len(model.postprocessors) > 0 - assert isinstance(model.postprocessors[0], StatsDenormalizer) -``` - -### Parity Tests 
- -Verify PhysicalAI output matches LeRobot's standalone runtime: - -```python -def test_parity_with_lerobot_runtime(pi0_package): - """Output matches LeRobot's own runtime (bit-for-bit with same seed).""" - # Load with PhysicalAI - pai_model = InferenceModel(pi0_package) - - # Load with LeRobot standalone - from lerobot.export import load_exported_policy - lr_model = load_exported_policy(pi0_package) - - # Compare outputs with same random seed - obs = generate_test_observation() - np.random.seed(42) - pai_output = pai_model(obs) - np.random.seed(42) - lr_output = lr_model.predict(obs) - - np.testing.assert_allclose(pai_output["action"], lr_output["action"], rtol=1e-5) -``` - -### Backward Compatibility Tests - -Verify v1.0 (flat) manifests still load: - -```python -def test_legacy_flat_manifest(legacy_package): - """v1.0 flat manifest loads without error.""" - model = InferenceModel(legacy_package) - assert model.runner is not None - assert model.adapter is not None -``` - ---- - -## 10. Migration from Legacy Formats - -This is a **clean cut** to the nested manifest format. There is no `_normalize_metadata()` shim --- the nested Pydantic models are the only manifest representation. Two legacy scenarios are handled: - -### Legacy `metadata.yaml` (Pre-Manifest Era) - -Early PhysicalAI exports used a flat `metadata.yaml` file instead of `manifest.json`. The existing `from_legacy_metadata()` classmethod on `Manifest` handles this case: - -```python -class Manifest(BaseModel): - @classmethod - def from_legacy_metadata(cls, metadata: dict) -> "Manifest": - """Convert old metadata.yaml fields to the nested Manifest structure. - - This handles truly old exports that predate manifest.json entirely. - """ - ... -``` - -This is the **only** backward compatibility code needed. It converts old YAML metadata to the new nested `Manifest` model once, at load time. - -### Migration Path - -| Step | Action | Breaking? 
| Target Version | -| ---- | ------ | --------- | -------------- | -| 1 | Implement nested `Manifest` Pydantic models (`manifest.py`) | No --- new code | v1.x (current) | -| 2 | Update `mixin_policy.py` to write nested `manifest.json` | No --- new exports use new format | v1.x | -| 3 | Update `model.py` and `factory.py` to use `Manifest` directly | No --- `from_legacy_metadata()` handles old YAML | v1.x | -| 4 | Update LeRobot PR to write same nested format | No --- same schema | v1.x | -| 5 | Add `manifest.schema.json` for CI validation in both repos | No --- additive | v1.x | - -> **Key point:** There is no flat-to-nested migration shim. All new manifests are nested from day one. Only pre-manifest `metadata.yaml` files need the legacy path, and that already exists. - -### Schema Enforcement - -To prevent schema drift between PhysicalAI and LeRobot, a shared `manifest.schema.json` (JSON Schema) is maintained and validated against in CI for both projects: - -```text -manifest.schema.json (shared, vendored into both repos) - | - +-- physicalai CI: validate exported manifests against schema - +-- lerobot CI: validate exported manifests against schema -``` - -This catches divergence at PR time rather than at runtime. The schema file is the single source of truth for manifest structure. - ---- - -## Appendix A: Design Rationale - -### Why One Format Instead of Two? - -The previous design (v1 plan) proposed two manifest formats: `lerobot_exported_policy` for LeRobot and `policy_package` for PhysicalAI, bridged by a `LeRobotManifestAdapter` class. This was rejected because: - -1. **Unnecessary complexity** --- An adapter class to translate between nearly-identical JSON schemas is pure overhead. -2. **Divergence risk** --- Two formats inevitably drift apart over time, making the adapter increasingly complex. -3. **Testing burden** --- Every feature needs testing against both formats. -4. **User confusion** --- Which format should I use? Does it matter? 
- -The converged format eliminates all of these problems. One schema, two producers, zero translation. - -### Why `model` as a Container? - -The `model` section mirrors the `InferenceModel` class hierarchy: - -- `InferenceModel` composes runner, adapter, preprocessors, postprocessors -- `model` contains runner, artifacts (for adapter), io (for pre/postprocessors) - -This follows the same pattern as training configs where the top-level sections (`model`, `data`, `trainer`) mirror the class hierarchy. It makes the manifest self-documenting: the JSON structure tells you the code structure. - -### Why Not `model: null` for Single-Pass? - -All policies need `n_obs_steps`, `artifacts`, and `io` regardless of runner type. Making `model` nullable would force these universal fields elsewhere (top-level or in `policy`), breaking the logical grouping. Instead, `model` is always present --- only the runner params differ between policy types. - -### Why Preprocessors Inside `io`? - -Preprocessors and postprocessors are I/O transforms --- they sit at the boundary between raw observations and model inputs. Placing them inside `io` (alongside `inputs` and `outputs`) makes this relationship explicit. The alternative (top-level `preprocessors`/`postprocessors`) separates logically related concepts. - -### Why `policy.source.class_path`? - -The `class_path` field enables PhysicalAI to instantiate the original policy class when the full PhysicalAI training framework is available. LeRobot ignores this field entirely. It is optional --- packages exported by LeRobot standalone may omit it or use a LeRobot-specific class path. 
- ---- - -## Appendix B: Comparison with Previous Design - -| Aspect | Previous Design (v1) | Current Design (converged) | -| ------------------ | -------------------------------------------------- | ---------------------------------------------- | -| Manifest formats | Two (`lerobot_exported_policy` + `policy_package`) | One (`policy_package`) | -| Format adapter | `LeRobotManifestAdapter` class (~100 lines) | None --- direct Pydantic parsing | -| Format detection | `if format == "lerobot_exported_policy"` branching | Not needed --- single format | -| Schema maintenance | Two schemas to keep in sync | One shared `manifest.schema.json` | -| Test matrix | 2x (each feature tested against both formats) | 1x | -| Normalization | Adapter auto-generates ComponentSpecs | Manifest declares pre/postprocessors directly | -| Runner resolution | `policy.kind` + separate `inference` block | Dual-path: `class_path` OR `type` → registry → `instantiate_component()` | -| Action params | Separate `action` section | Params in `model.runner` | -| Backward compat | Format detection + adapter routing | `from_legacy_metadata()` for pre-manifest YAML only | -| Component formats | `class_path` + `init_args` only | Both `type` + flat params (interop) and `class_path` + `init_args` (full-power) | - ---- - ## Related Documents - **[Inference Core Design](../components/inferencekit.md)** --- Domain-agnostic inference layer @@ -970,5 +505,5 @@ The `class_path` field enables PhysicalAI to instantiate the original policy cla --- -_Document version: 5.1_ +_Document version: 6.0_ _Last updated: 2026-03-31_ From 7f6e84f48f4ee327af2cf8de9cd5fcaf8d75c203 Mon Sep 17 00:00:00 2001 From: Samet Date: Tue, 31 Mar 2026 14:56:56 +0200 Subject: [PATCH 4/4] docs: add concrete hardware examples and component naming mapping table --- docs/design/components/inferencekit.md | 70 ++++++++++++++++++++++++-- docs/design/integrations/lerobot.md | 39 +++++++++++--- 2 files changed, 99 insertions(+), 10 deletions(-) 
diff --git a/docs/design/components/inferencekit.md b/docs/design/components/inferencekit.md index 60216d8..4e285fa 100644 --- a/docs/design/components/inferencekit.md +++ b/docs/design/components/inferencekit.md @@ -476,8 +476,26 @@ manifest.json ] }, "hardware": { - "robots": [], - "cameras": [] + "robots": [ + { + "name": "main", + "type": "SO-100", + "state": { + "shape": [6], + "dtype": "float32", + "order": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"] + }, + "action": { + "shape": [6], + "dtype": "float32", + "order": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"] + } + } + ], + "cameras": [ + {"name": "top", "shape": [3, 480, 640], "dtype": "uint8"}, + {"name": "wrist", "shape": [3, 480, 640], "dtype": "uint8"} + ] }, "metadata": { "created_at": "2026-03-27T12:00:00Z", @@ -486,6 +504,8 @@ manifest.json } ``` +The `hardware` section declares what the policy **expects** at inference time — logical names, tensor shapes, and dtypes. These are the names used during training (e.g., `"top"`, `"wrist"` for cameras; `"main"` for the robot). At deployment, the user maps logical names to physical devices (e.g., `"top"` → `/dev/video0`). The `order` field in robot specs declares joint ordering — critical for multi-arm setups where `[left, right]` vs `[right, left]` concatenation produces valid shapes with wrong semantics. + > **Note:** For the full manifest schema reference (all runner variants, field descriptions, and design rationale), see [LeRobot Integration Design](../integrations/lerobot.md#2-converged-manifest-format). The format is shared by both PhysicalAI and LeRobot exports. 
**PhysicalAI-native format (`class_path` + `init_args`):** @@ -537,8 +557,26 @@ PhysicalAI can also write manifests using the full `class_path` + `init_args` fo ] }, "hardware": { - "robots": [], - "cameras": [] + "robots": [ + { + "name": "main", + "type": "SO-100", + "state": { + "shape": [6], + "dtype": "float32", + "order": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"] + }, + "action": { + "shape": [6], + "dtype": "float32", + "order": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"] + } + } + ], + "cameras": [ + {"name": "top", "shape": [3, 480, 640], "dtype": "uint8"}, + {"name": "wrist", "shape": [3, 480, 640], "dtype": "uint8"} + ] }, "metadata": { "created_at": "2026-03-27T12:00:00Z", @@ -549,6 +587,30 @@ PhysicalAI can also write manifests using the full `class_path` + `init_args` fo > **Both formats resolve identically.** The `type`-based example above (used by LeRobot) and this `class_path`-based example both resolve to the same runner and processor instances through the `ComponentRegistry`. See [Dual Component Resolution](../integrations/lerobot.md#dual-component-resolution) for the full resolution algorithm. +**Naming mapping between the two formats:** + +| `type` (short name) | `class_path` (full Python path) | Purpose | +| --- | --- | --- | +| `"action_chunking"` | `"physicalai.inference.runners.ActionChunkingRunner"` | Runner: returns a chunk of future actions per step | +| `"single_pass"` | `"physicalai.inference.runners.SinglePassRunner"` | Runner: single forward pass per step | +| `"normalize"` | `"physicalai.inference.preprocessors.StatsNormalizer"` | Preprocessor: normalizes observations using dataset stats | +| `"denormalize"` | `"physicalai.inference.postprocessors.StatsDenormalizer"` | Postprocessor: denormalizes actions using dataset stats | + +- **`type` + flat params**: LeRobot writes this. Both frameworks can read it. 
Short name resolved via `ComponentRegistry`. +- **`class_path` + `init_args`**: PhysicalAI writes this. The class is imported directly from its Python path — no short-name lookup is needed. Useful for custom/third-party components that aren't in the built-in registry. + +For example, these two `ComponentSpec`s resolve to the exact same object: + +```jsonc +// type format (LeRobot-compatible): +{"type": "action_chunking", "chunk_size": 100, "n_action_steps": 100} + +// class_path format (PhysicalAI-native): +{"class_path": "physicalai.inference.runners.ActionChunkingRunner", "init_args": {"chunk_size": 100, "n_action_steps": 100}} + +// Both → ActionChunkingRunner(chunk_size=100, n_action_steps=100) +``` + **How models are loaded:** The framework reads `manifest.json` and resolves the model configuration using **dual-path component resolution**: diff --git a/docs/design/integrations/lerobot.md b/docs/design/integrations/lerobot.md index 36a3f56..f4f6741 100644 --- a/docs/design/integrations/lerobot.md +++ b/docs/design/integrations/lerobot.md @@ -169,8 +169,26 @@ manifest.json ] }, "hardware": { - "robots": [], - "cameras": [] + "robots": [ + { + "name": "main", + "type": "SO-100", + "state": { + "shape": [6], + "dtype": "float32", + "order": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"] + }, + "action": { + "shape": [6], + "dtype": "float32", + "order": ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"] + } + } + ], + "cameras": [ + {"name": "top", "shape": [3, 480, 640], "dtype": "uint8"}, + {"name": "wrist", "shape": [3, 480, 640], "dtype": "uint8"} + ] }, "metadata": { "created_at": "2026-03-27T12:00:00Z", @@ -254,10 +272,19 @@ The `model.runner` section is open-ended --- policy-specific parameters go direc #### `hardware` --- Deployment -| Field | Type | Required | Description | -| ------------------ | ----- | -------- | -------------------------------- | -| `hardware.robots` | array | No | Robot configurations (optional) 
| -| `hardware.cameras` | array | No | Camera configurations (optional) | +| Field | Type | Required | Description | +| ------------------------------- | ------ | -------- | ------------------------------------------------------------------ | +| `hardware.robots` | array | No | Robot configurations | +| `hardware.robots[].name` | string | Yes | Logical name (e.g., `"main"`, `"left_arm"`) | +| `hardware.robots[].type` | string | No | Robot model string (informational, e.g., `"SO-100"`) | +| `hardware.robots[].state` | object | No | Expected state tensor: `shape`, `dtype`, `order` (joint ordering) | +| `hardware.robots[].action` | object | No | Expected action tensor: `shape`, `dtype`, `order` (joint ordering) | +| `hardware.cameras` | array | No | Camera configurations | +| `hardware.cameras[].name` | string | Yes | Logical name matching training data keys (e.g., `"top"`, `"wrist"`) | +| `hardware.cameras[].shape` | array | No | `[C, H, W]` tensor shape (e.g., `[3, 480, 640]`) | +| `hardware.cameras[].dtype` | string | No | NumPy dtype string (default: `"uint8"`) | + +The `order` field in robot specs declares joint ordering. This is critical for multi-arm setups where `[left, right]` vs `[right, left]` concatenation produces valid shapes with wrong semantics. When present, the runtime can compare the declared order against the robot's actual joint order and catch mismatches at startup. Camera and robot `name` fields are **logical names** matching the keys used during training — at deployment, the user maps these to physical devices. #### `metadata` --- Provenance