From 33ea11cf05c31d80a1445edeb657b80b61837d36 Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Fri, 15 May 2026 14:22:46 +0200
Subject: [PATCH 01/19] runtime draft

---
 .../runtime-system/implementation_plan.md     | 1146 +++++++++++++++++
 examples/runtime/inference_async.py           |  764 +++++++++++
 examples/runtime/inference_pi05_gpu.ipynb     |  838 ++++++++++++
 src/physicalai/runtime/__init__.py            |   43 +
 src/physicalai/runtime/_action_queue.py       |   72 ++
 src/physicalai/runtime/execution.py           |  250 ++++
 src/physicalai/runtime/runtime.py             |  226 ++++
 src/physicalai/runtime/smoothers.py           |   71 +
 8 files changed, 3410 insertions(+)
 create mode 100644 docs/design/components/runtime-system/implementation_plan.md
 create mode 100644 examples/runtime/inference_async.py
 create mode 100644 examples/runtime/inference_pi05_gpu.ipynb
 create mode 100644 src/physicalai/runtime/__init__.py
 create mode 100644 src/physicalai/runtime/_action_queue.py
 create mode 100644 src/physicalai/runtime/execution.py
 create mode 100644 src/physicalai/runtime/runtime.py
 create mode 100644 src/physicalai/runtime/smoothers.py

diff --git a/docs/design/components/runtime-system/implementation_plan.md b/docs/design/components/runtime-system/implementation_plan.md
new file mode 100644
index 0000000..07b1473
--- /dev/null
+++ b/docs/design/components/runtime-system/implementation_plan.md
@@ -0,0 +1,1146 @@
+# Runtime System — Implementation Plan
+
+This document is the implementation plan for `physicalai.runtime`. It refines the original [policy_runtime_design.md](./policy_runtime_design.md) based on codebase exploration, bug analysis, and architecture review.
+
+Read [policy_runtime_design.md](./policy_runtime_design.md) first for API shape and ownership rules. This document covers what to build, in what order, and why.
+
+## Reference Implementation
+
+The golden reference is `physicalai/examples/runtime/inference_async.py` — a working async prototype with QueueMixer, InferenceThread, velocity clamping, and camera discovery. Every runtime component must match or exceed its behavior.
+
+---
+
+## Phase 1: Critical Bug Fixes (half day) ✅
+
+Fix bugs on the code path Phase 2 depends on. Phase 2 defines a public runtime contract — workarounds would calcify into permanent API shape. `predict_action_chunk()` currently raises `RuntimeError` without these fixes because Bug 2's inverted guard blocks the runtime's call to `model.predict_action_chunk(obs)`. This is not stylistic — it is a hard blocker.
+
+### Bug 1: `use_action_queue` checks manifest, ignores runtime runner
+
+**File**: `physicalai/src/physicalai/inference/model.py` — `use_action_queue` property
+
+**Problem**: Reads `self.manifest.model.runner` class_path. Ignores `self.runner` passed at construction or set at runtime.
+
+**Fix**: Check `isinstance(self.runner, ActionChunking)` instead:
+
+```python
+@property
+def use_action_queue(self) -> bool:
+    from physicalai.inference.runners.action_chunking import ActionChunking
+    return isinstance(self.runner, ActionChunking)
+```
+
+### Bug 2: `select_action()` / `predict_action_chunk()` guards inverted
+
+**File**: `physicalai/src/physicalai/inference/model.py`
+
+**Problem**: `select_action()` raises when `not use_action_queue`. `predict_action_chunk()` raises when `use_action_queue`. Both are backwards — `select_action()` should be the generic one-action API, `predict_action_chunk()` should return raw chunks.
+
+**Fix**: Remove guards entirely. Both methods should work for any runner (shape-stable contract per design doc §3):
+
+| Runner          | `select_action()`  | `predict_action_chunk()` |
+| --------------- | ------------------ | ------------------------ |
+| single-pass     | runner output      | wrap as `(1, D)` chunk   |
+| chunk-producing | pop one via cursor | runner output            |
+
+### Bug 3: ACT export manifest declares SinglePass instead of ActionChunking
+
+**File**: `library/src/physicalai/export/mixin_policy.py` — `_build_manifest()`
+
+**Problem**: Checks `metadata.get("use_action_queue", False)` but ACT never sets this metadata flag. Manifest always gets `SinglePass` runner.
+
+**Fix**: ACT export must pass `use_action_queue=True` and `chunk_size=<config.chunk_size>` in its metadata kwargs. Same fix needed for Pi0.5 (Bug 5).
+
+### Deferred Bugs (document as issues, do not block Phase 2)
+
+| Bug   | Summary                                                               | Why deferred                                            |
+| ----- | --------------------------------------------------------------------- | ------------------------------------------------------- |
+| Bug 4 | Manifest missing `hardware` section (`RobotSpec`, `CameraSpec`)       | Not on inference code path                              |
+| Bug 5 | Pi0.5 export also declares SinglePass                                 | Same root cause as Bug 3, fix together                  |
+| Bug 7 | Pi0.5 normalization not baked into graph (external pre/postprocessor) | By design — manifest `preprocessors_specs` handles this |
+| Bug 8 | Pi0.5 denoising loop not exportable cleanly (11x graph size)          | Export concern, not runtime concern                     |
+| Bug 9 | `OVTokenizer` may need `import openvino_tokenizers` for custom ops    | Needs verification — may already work via adapter       |
+
+---
+
+## Phase 2: Runtime System (2–3 days) ✅
+
+New package: `physicalai/src/physicalai/runtime/`
+
+```text
+physicalai/src/physicalai/runtime/
+├── __init__.py              # exports: PolicyRuntime, SyncExecution, AsyncExecution,
+│                            #          ActionQueue, LerpSmoother, ReplaceSmoother, RunStats
+├── smoothers.py             # ChunkSmoother ABC, ReplaceSmoother, LerpSmoother
+├── _action_queue.py         # ActionQueue (public via __init__, internal module)
+├── execution.py             # Execution ABC, SyncExecution, AsyncExecution, WorkerDiedError
+└── runtime.py               # PolicyRuntime, RunStats, default_observation_to_input
+```
+
+Dependency order: `smoothers.py` → `_action_queue.py` → `execution.py` → `runtime.py` → `__init__.py`
+
+### Architectural Decisions
+
+**ActionQueue is owned by PolicyRuntime, not hidden inside Execution.**
+
+The original design doc keeps Execution (scheduling) and ActionQueue (buffering) as separate concerns. This is correct — when `AsyncExecution(transport="process")` or `RemoteExecution` arrive, they should push chunks into the same ActionQueue without duplicating buffer logic.
+
+Users get a clean default API:
+
+```python
+runtime = PolicyRuntime(
+    robot=robot,
+    model=model,
+    execution=AsyncExecution(threshold=0.5),
+    fps=30,
+)
+runtime.run(duration_s=60)
+```
+
+Power users can override buffering:
+
+```python
+runtime = PolicyRuntime(
+    robot=robot,
+    model=model,
+    execution=AsyncExecution(threshold=0.5),
+    action_queue=ActionQueue(smoother=LerpSmoother(duration_frames=10)),
+    fps=30,
+)
+```
+
+**Execution is a scheduler, not a buffer.** It decides when/where inference runs and pushes results into ActionQueue. It does not own pop, remaining, or chunk_size.
+
+**InferenceModel must NOT import ActionQueue.** Per design doc §4: if both layers need pop-from-chunk mechanics, they share `ActionChunkCursor`, not `ActionQueue`.
+
+### 2.1 `smoothers.py`
+
+Extracted from `QueueMixer.add()` in inference_async.py.
+
+```python
+class ChunkSmoother(ABC):
+    """Merges a new action chunk into remaining actions from the previous chunk."""
+
+    @abstractmethod
+    def merge(
+        self,
+        remaining: np.ndarray,    # (R, action_dim) — unconsumed actions from previous chunk
+        incoming: np.ndarray,     # (H, action_dim) — new chunk from inference
+        offset: int,              # skip first N actions of incoming (latency compensation)
+    ) -> np.ndarray:
+        """Return merged actions array. Called by ActionQueue.push_chunk()."""
+        ...
+
+
+class ReplaceSmoother(ChunkSmoother):
+    """Drop remaining actions, use incoming[offset:]."""
+
+    def merge(self, remaining, incoming, offset):
+        return incoming[offset:]
+
+
+class LerpSmoother(ChunkSmoother):
+    """Lerp-blend overlapping region, then append non-overlapping tail.
+
+    Stateless merge — no hidden mutation. duration_frames is the fallback
+    blending window used when offset is 0. When offset > 0, the blending
+    window is computed from offset directly: lerp_dur = max(offset, 1).
+
+    Matches QueueMixer.add() from inference_async.py:
+    - Weights: w_i = max(1.0 - i / lerp_dur, 0.0) for old actions
+    - Overlap region: blended = w * remaining + (1 - w) * incoming
+    """
+
+    def __init__(self, duration_frames: int = 5) -> None:
+        self.duration_frames = duration_frames
+
+    def merge(self, remaining, incoming, offset):
+        lerp_dur = max(offset, 1) if offset > 0 else self.duration_frames
+
+        incoming = incoming[offset:]
+        n_remain = len(remaining)
+        lerp_dur = min(n_remain, lerp_dur)
+
+        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)
+        weights = weights[:, np.newaxis]
+
+        n_blend = min(n_remain, len(incoming))
+        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]
+
+        return np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)
+```
+
+Key: `offset` is not just "skip N actions." It is latency compensation — `offset = int(inference_latency * fps)`. The smoother must handle this, not the caller.
+
+### 2.2 `_action_queue.py` (public API, internal module)
+
+Thread-safe action buffer with smoother integration and hold telemetry.
+
+```python
+class ActionQueue:
+    """Thread-safe action buffer with chunk smoothing and starvation telemetry.
+
+    Public API — exported from physicalai.runtime. Power users can override
+    the default ActionQueue on PolicyRuntime to customize smoothing behavior.
+    Execution pushes chunks into it; PolicyRuntime pops actions from it.
+    """
+
+    def __init__(self, smoother: ChunkSmoother | None = None) -> None:
+        self._smoother = smoother or ReplaceSmoother()
+        self._deque: deque[np.ndarray] = deque()
+        self._lock = threading.Lock()
+        self._consecutive_holds: int = 0
+        self._total_holds: int = 0
+        self._total_pops: int = 0
+
+    def push_chunk(self, chunk: np.ndarray, offset: int = 0) -> None:
+        """Merge a new chunk into the queue. Thread-safe."""
+        with self._lock:
+            remaining = (np.stack(list(self._deque))
+                         if self._deque
+                         else np.empty((0, chunk.shape[1]), dtype=chunk.dtype))
+            merged = self._smoother.merge(remaining, chunk, offset)
+            self._deque.clear()
+            self._deque.extend(merged)
+
+    def pop(self) -> np.ndarray | None:
+        """Pop next action, or None if empty. Thread-safe."""
+        with self._lock:
+            if not self._deque:
+                self._consecutive_holds += 1
+                self._total_holds += 1
+                return None
+            self._consecutive_holds = 0
+            self._total_pops += 1
+            return self._deque.popleft()
+
+    @property
+    def remaining(self) -> int:
+        with self._lock:
+            return len(self._deque)
+
+    def below_threshold(self, threshold: int) -> bool:
+        return self.remaining < threshold
+
+    def clear(self) -> None:
+        with self._lock:
+            self._queue = None
+            self._index = 0
+
+    # --- Telemetry ---
+    @property
+    def consecutive_holds(self) -> int:
+        return self._consecutive_holds
+
+    @property
+    def total_holds(self) -> int:
+        return self._total_holds
+
+    @property
+    def total_pops(self) -> int:
+        return self._total_pops
+```
+
+### 2.3 `execution.py`
+
+**Execution ABC** — scheduler only. Pushes chunks into ActionQueue, does not own pop/remaining.
+
+```python
+class Execution(ABC):
+    """Decides when and where inference runs. Pushes results into ActionQueue."""
+
+    @abstractmethod
+    def start(self, model: InferenceModel, action_queue: ActionQueue) -> None:
+        """Bind to model and queue. Called once before the loop."""
+        ...
+
+    @abstractmethod
+    def maybe_request(self, observation: dict[str, np.ndarray]) -> None:
+        """Check if new inference is needed. If so, run or schedule it."""
+        ...
+
+    @abstractmethod
+    def warmup(self, sample_observation: dict[str, np.ndarray]) -> None:
+        """Run one inference to discover chunk_size and seed the queue.
+
+        After warmup():
+        - action_queue has one chunk ready (robot starts moving immediately)
+        - self.chunk_size is set
+        - self.action_dim is set
+        """
+        ...
+
+    @abstractmethod
+    def stop(self) -> None:
+        """Stop scheduling. For async: signal thread, join with timeout."""
+        ...
+
+    @property
+    @abstractmethod
+    def chunk_size(self) -> int:
+        """Discovered after warmup(). Used to compute threshold."""
+        ...
+```
+
+**SyncExecution** — blocks on inference when queue runs low.
+
+```python
+class SyncExecution(Execution):
+    """Synchronous inference in the control thread."""
+
+    def __init__(self) -> None:
+        self._model: InferenceModel | None = None
+        self._queue: ActionQueue | None = None
+        self._chunk_size: int = 0
+
+    def start(self, model, action_queue):
+        self._model = model
+        self._queue = action_queue
+
+    def warmup(self, sample_observation):
+        actions = self._model.predict_action_chunk(sample_observation)
+        self._chunk_size = actions.shape[0]
+        self._queue.push_chunk(actions, offset=0)
+
+    def maybe_request(self, observation):
+        if self._queue.below_threshold(1):  # refill when empty
+            actions = self._model.predict_action_chunk(observation)
+            self._queue.push_chunk(actions, offset=0)
+
+    def stop(self):
+        pass
+
+    @property
+    def chunk_size(self):
+        return self._chunk_size
+```
+
+**AsyncExecution** — background thread with health monitoring. Maps to `InferenceThread` from inference_async.py.
+
+```python
+class WorkerDiedError(RuntimeError):
+    """Raised when the inference worker thread dies unexpectedly."""
+    pass
+
+
+class AsyncExecution(Execution):
+    """Async inference in a background thread.
+
+    Thread architecture (matches inference_async.py):
+
+        Control thread (main):              Inference thread (background):
+        ─────────────────────               ────────────────────────────
+        loop at fps:                        loop:
+          obs = robot.get_observation()       wait for obs_slot
+          execution.maybe_request(obs)        chunk = model.predict_action_chunk(obs)
+          action = queue.pop()                offset = int(latency * fps)
+          robot.send_action(action)           queue.push_chunk(chunk, offset)
+    """
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        fps: int = 30,
+        watchdog_timeout_s: float = 30.0,
+        max_consecutive_holds: int | None = None,   # default: 3 * fps (3 seconds)
+    ) -> None:
+        self._threshold_frac = threshold
+        self._fps = fps
+        self._watchdog_timeout_s = watchdog_timeout_s
+        self._max_consecutive_holds = max_consecutive_holds or 3 * fps
+
+        # Set during start()
+        self._model: InferenceModel | None = None
+        self._queue: ActionQueue | None = None
+        self._chunk_size: int = 0
+        self._threshold_count: int = 0
+
+        # Thread state
+        self._lock = threading.Lock()
+        self._obs_slot: dict | None = None
+        self._obs_ready = threading.Event()
+        self._running_inference = False
+        self._request_time: float = 0.0
+        self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None
+        self._death_cause: BaseException | None = None
+        self._inference_count: int = 0
+
+    def start(self, model, action_queue):
+        self._model = model
+        self._queue = action_queue
+        self._thread = threading.Thread(target=self._run, name="InferenceThread", daemon=True)
+        self._thread.start()
+
+    def warmup(self, sample_observation):
+        actions = self._model.predict_action_chunk(sample_observation)
+        self._chunk_size = actions.shape[0]
+        self._threshold_count = int(self._chunk_size * self._threshold_frac)
+        self._queue.push_chunk(actions, offset=0)
+
+    def maybe_request(self, observation):
+        # Check for worker death — raise, don't silently continue
+        if self._thread is not None and not self._thread.is_alive():
+            if self._death_cause is not None:
+                raise WorkerDiedError(
+                    f"Inference thread died: {self._death_cause}"
+                ) from self._death_cause
+
+        # Check for stuck inference
+        if self._busy_duration > self._watchdog_timeout_s:
+            logger.warning(
+                "Inference stuck for %.0fs — force resetting", self._busy_duration,
+            )
+            self._force_reset()
+
+        # Submit if queue is low and worker is idle
+        if self._queue.below_threshold(self._threshold_count):
+            if not self._busy:
+                # Defensive copy — observation may be reused by caller
+                snapshot = {
+                    k: v.copy() if isinstance(v, np.ndarray) else v
+                    for k, v in observation.items()
+                }
+                with self._lock:
+                    self._obs_slot = snapshot
+                    self._request_time = time.perf_counter()
+                self._obs_ready.set()
+
+    def stop(self):
+        if self._thread is not None:
+            self._stop_event.set()
+            self._obs_ready.set()   # unblock wait
+            self._thread.join(timeout=10.0)
+
+    @property
+    def chunk_size(self):
+        return self._chunk_size
+
+    # --- Health properties ---
+
+    @property
+    def alive(self) -> bool:
+        return self._thread is not None and self._thread.is_alive()
+
+    @property
+    def _busy(self) -> bool:
+        with self._lock:
+            return self._obs_slot is not None or self._running_inference
+
+    @property
+    def _busy_duration(self) -> float:
+        with self._lock:
+            if not (self._obs_slot is not None or self._running_inference):
+                return 0.0
+            return time.perf_counter() - self._request_time
+
+    @property
+    def inference_count(self) -> int:
+        return self._inference_count
+
+    # --- Internal ---
+
+    def _force_reset(self) -> None:
+        with self._lock:
+            self._obs_slot = None
+            self._running_inference = False
+        logger.warning("Force reset — cleared stuck inference state")
+
+    def _run(self) -> None:
+        """Inference thread main loop."""
+        try:
+            while not self._stop_event.is_set():
+                self._obs_ready.wait()
+                self._obs_ready.clear()
+
+                if self._stop_event.is_set():
+                    return
+
+                with self._lock:
+                    obs = self._obs_slot
+                    self._obs_slot = None
+                    if obs is None:
+                        continue
+                    self._running_inference = True
+
+                t0 = time.perf_counter()
+                actions = self._model.predict_action_chunk(obs)
+                latency = time.perf_counter() - t0
+
+                offset = int(latency * self._fps)
+
+                self._queue.push_chunk(actions, offset=offset)
+                self._inference_count += 1
+
+                with self._lock:
+                    self._running_inference = False
+
+        except Exception as e:
+            self._death_cause = e
+            logger.error("Inference thread died: %s", e, exc_info=True)
+```
+
+### 2.4 `runtime.py`
+
+```python
+from __future__ import annotations
+
+import logging
+import time
+from collections.abc import Callable, Mapping, Sequence
+from typing import Any, Protocol
+
+import numpy as np
+
+from physicalai.capture.camera import Camera, Frame
+from physicalai.inference import InferenceModel
+from physicalai.robot.interface import Robot, RobotObservation
+from physicalai.runtime._action_queue import ActionQueue
+from physicalai.runtime.execution import Execution, WorkerDiedError
+from physicalai.runtime.smoothers import LerpSmoother
+
+logger = logging.getLogger(__name__)
+
+def default_observation_to_input(
+    robot_obs: RobotObservation,
+    camera_frames: dict[str, Frame],
+) -> dict[str, Any]:
+    """Default observation-to-model-input conversion.
+
+    Maps:
+    - Joint positions → "state" array
+    - Camera frames → "images.{name}" arrays
+
+    For Pi0.5 or other models needing custom keys (e.g. "task"),
+    pass a custom obs_to_input callable to PolicyRuntime.
+    """
+    model_input: dict[str, Any] = {}
+
+    # Collect joint positions into "state" vector
+    if robot_obs.joint_positions:
+        model_input["state"] = np.array([robot_obs.joint_positions], dtype=np.float32)
+
+    # Map camera frames to "images.{name}"
+    for name, frame in camera_frames.items():
+        model_input[f"images.{name}"] = frame.data
+
+    return model_input
+
+
+class RuntimeCallback(Protocol):
+    """Optional hook points in the PolicyRuntime control loop."""
+
+    def before_send_action(self, *, action: np.ndarray, step: int) -> np.ndarray | None:
+        """Called before sending action. Return modified action or None to use original."""
+        ...
+
+    def on_action_sent(self, *, action: np.ndarray, step: int) -> None:
+        """Called after action is sent to robot."""
+        ...
+
+    def on_hold(self, *, step: int, holds: int) -> None:
+        """Called when action queue is empty and robot holds last position."""
+        ...
+
+
+class PolicyRuntime:
+    """Runs a policy on robot hardware.
+
+    Loop shape (matches inference_async.py):
+        obs = robot.get_observation()
+        model_input = obs_to_input(obs, cameras)
+        execution.maybe_request(model_input)
+        action = action_queue.pop()
+        if action is None: hold position
+        robot.send_action(action)
+        sleep_until_next_tick()
+    """
+
+    def __init__(
+        self,
+        robot: Robot,
+        model: InferenceModel,
+        execution: Execution,
+        fps: float,
+        cameras: Mapping[str, Camera] | None = None,
+        action_queue: ActionQueue | None = None,
+        obs_to_input: Callable[[RobotObservation, dict[str, Frame]], dict[str, Any]] | None = None,
+        callbacks: Sequence[RuntimeCallback] = (),
+    ) -> None:
+        self._robot = robot
+        self._model = model
+        self._execution = execution
+        self._fps = fps
+        self._cameras = cameras or {}
+        self._action_queue = action_queue or ActionQueue(smoother=LerpSmoother(duration_frames=5))
+        self._obs_to_input = obs_to_input or default_observation_to_input
+        self._callbacks = list(callbacks)
+
+    def run(self, *, duration_s: float | None = None) -> RunStats:
+        """Run the control loop.
+
+        1. Warm up — run one inference, seed queue, discover chunk_size
+        2. Loop — observe, maybe_request, pop, send, sleep
+        3. Shutdown — stop execution, drain
+        """
+        # --- Init ---
+        self._execution.start(self._model, self._action_queue)
+
+        sample_obs = self._build_model_input()
+        self._execution.warmup(sample_obs)
+
+        goal_time = 1.0 / self._fps
+        step = 0
+        last_action: np.ndarray | None = None
+
+        try:
+            while True:
+                if duration_s is not None and step * goal_time >= duration_s:
+                    break
+
+                loop_start = time.perf_counter()
+
+                # 1. Observe
+                obs = self._build_model_input()
+
+                # 2. Maybe request inference
+                self._execution.maybe_request(obs)
+
+                # 3. Pop action
+                action = self._action_queue.pop()
+                if action is not None:
+                    last_action = action
+                else:
+                    action = last_action
+                    holds = self._action_queue.consecutive_holds
+                    if holds == 1:
+                        logger.warning("Queue empty — holding position")
+                    elif holds % self._fps == 0:
+                        logger.warning(
+                            "Queue starvation: %d consecutive holds (%.1fs)",
+                            holds, holds / self._fps,
+                        )
+                    self._invoke_callback("on_hold", step=step, holds=holds)
+
+                if action is None:
+                    # No warmup result and no previous action — skip
+                    logger.error("No action available (warmup may have failed)")
+                    continue
+
+                # 4. Callbacks
+                action = self._invoke_callback("before_send_action", action=action, step=step) or action
+
+                # 5. Send
+                self._robot.send_action(action)
+                self._invoke_callback("on_action_sent", action=action, step=step)
+
+                # 6. Timing
+                elapsed = time.perf_counter() - loop_start
+                sleep_time = goal_time - elapsed
+                if sleep_time > 0:
+                    time.sleep(sleep_time)
+
+                step += 1
+
+        except KeyboardInterrupt:
+            logger.info("Interrupted by user")
+        except WorkerDiedError as e:
+            logger.error("Worker died during runtime: %s", e)
+            raise
+        finally:
+            self._shutdown(step)
+
+        return RunStats(
+            steps=step,
+            total_pops=self._action_queue.total_pops,
+            total_holds=self._action_queue.total_holds,
+            inference_count=getattr(self._execution, "inference_count", 0),
+        )
+
+    def _build_model_input(self) -> dict[str, Any]:
+        robot_obs = self._robot.get_observation()
+        camera_frames = {name: cam.read_latest() for name, cam in self._cameras.items()}
+        return self._obs_to_input(robot_obs, camera_frames)
+
+    def _shutdown(self, step: int) -> None:
+        """Robot and cameras must be connected before run(). Caller owns lifecycle."""
+        # 1. Stop inference scheduling
+        self._execution.stop()
+
+        # 2. Drain remaining actions (up to 1s) for smooth stop
+        remaining = self._action_queue.remaining
+        drain_limit = min(remaining, int(self._fps))
+        for _ in range(drain_limit):
+            action = self._action_queue.pop()
+            if action is not None:
+                self._robot.send_action(action)
+                time.sleep(1.0 / self._fps)
+
+        logger.info(
+            "Shutdown complete — %d steps, %d pops, %d holds",
+            step, self._action_queue.total_pops, self._action_queue.total_holds,
+        )
+
+    def _invoke_callback(self, method: str, **kwargs):
+        result = None
+        for cb in self._callbacks:
+            fn = getattr(cb, method, None)
+            if fn is not None:
+                result = fn(**kwargs)
+        return result
+```
+
+### 2.5 Tests
+
+```text
+physicalai/tests/unit/runtime/
+├── test_smoothers.py        # ReplaceSmoother, LerpSmoother: offset handling, lerp weights,
+│                            #   dynamic duration, edge cases (empty remaining, offset > chunk)
+├── test_action_queue.py     # push/pop, smoother integration, thread safety (concurrent
+│                            #   push+pop from 2 threads), hold counters, clear()
+├── test_execution.py        # SyncExecution: warmup seeds queue, maybe_request refills
+│                            # AsyncExecution: mock model, health monitoring (alive/busy/
+│                            #   busy_duration), WorkerDiedError propagation, force_reset
+└── test_runtime.py          # PolicyRuntime with mock robot + mock model:
+                             #   full loop, hold fallback, shutdown drain, callbacks,
+                             #   WorkerDiedError propagation, duration_s limit
+```
+
+All tests use mock `InferenceModel` and mock `Robot` — no hardware, no exported models.
+
+---
+
+## Phase 3: CLI and Integration (1–2 days)
+
+1. `physicalai run --config so101_pi05.yaml` CLI command
+2. YAML config loader (`PolicyRuntime.from_config()`)
+3. Observation builder (bridges `Robot` protocol + `Camera` → model input dict)
+4. Migrate `inference_async.py` to use `PolicyRuntime` (becomes ~20 lines)
+
+### Velocity clamping and camera discovery stay outside core
+
+Velocity clamping (`max_speed`, `ramp_steps`, `commanded_pos` tracking) is SO-101-specific. Camera discovery (interactive selection, name mapping, blank cameras, flip) is app-specific.
+
+These belong in:
+
+- Example scripts (`examples/runtime/`)
+- CLI helpers (`physicalai.cli.run`)
+- User callbacks
+
+Not in `PolicyRuntime` or `Execution`. If a reusable pattern emerges across 2+ robots, promote to a formal action-transform layer later.
+
+---
+
+## Phase 3.5: Runtime Telemetry (1–2 days)
+
+Streaming observability for the runtime control loop. The runtime process must not be bottlenecked by telemetry — all emission is fire-and-forget over zenoh pub-sub. A separate observer process handles visualization, aggregation, and persistence.
+
+### Constraint
+
+Two-week release deadline. Scope: emitter in runtime, observer CLI, zenoh-only transport. No OpenTelemetry, no Studio UI integration, no remote telemetry. Use OTel-compatible metric naming so a future OTLP bridge is trivial.
+
+### Architecture
+
+```text
+┌──────────────────────────────────────────┐
+│  Runtime Process (real-time budget)      │
+│                                          │
+│  PolicyRuntime.run()                     │
+│    └─ TelemetryEmitter                   │
+│         zenoh.put() per tick/inference   │
+│         no-op if zenoh not installed     │
+└──────────┬───────────────────────────────┘
+           │ zenoh pub-sub (local SHM)
+           ▼
+┌──────────────────────────────────────────┐
+│  Observer Process (separate, optional)   │
+│                                          │
+│  TelemetrySubscriber                     │
+│    ├─ Live console dashboard             │
+│    ├─ JSONL file sink (--record)         │
+│    └─ Future: WebSocket → Studio UI      │
+└──────────────────────────────────────────┘
+```
+
+### Topic Schema
+
+All topics prefixed with `physicalai/rt/{session_id}/`. Session ID is a short hex string generated at `run()` start.
+
+| Topic       | Frequency                            | Payload (msgpack)                                                                                                            | Purpose                                   |
+| ----------- | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- |
+| `tick`      | Every control loop tick (30 Hz)      | `step`, `timestamp`, `joint_positions: (D,)`, `action_sent: (D,)`, `queue_remaining: int`, `loop_duration_s`, `sleep_time_s` | Core loop health + executed trajectory    |
+| `inference` | Per inference completion (~2–3 Hz)   | `latency_s`, `offset`, `chunk: (H, D)`                                                                                       | Latency monitoring + predicted trajectory |
+| `lifecycle` | On start / warmup / shutdown / error | `event: str`, `metadata: dict`                                                                                               | Session boundaries                        |
+
+**Serialization**: msgpack with numpy arrays encoded as `{"__np__": true, "dtype": str, "shape": list, "data": bytes}`. Encode cost is ~50 μs per tick event, ~100 μs per inference event. Negligible in a 33 ms tick budget.
+
+**Naming convention**: Scalar fields use OTel-compatible names (`physicalai.runtime.loop_duration_s`, `physicalai.runtime.inference_latency_s`) so a future OTLP exporter in the observer can re-export without renaming.
+
+### New Files
+
+```text
+physicalai/src/physicalai/runtime/
+├── _telemetry.py                    # TelemetryEmitter (zenoh publisher, no-op fallback)
+└── observer/
+    ├── __init__.py
+    ├── __main__.py                  # python -m physicalai.runtime.observer
+    ├── _subscriber.py               # TelemetrySubscriber (zenoh sub + dispatch)
+    ├── _console.py                  # Live console dashboard
+    └── _recorder.py                 # JSONL file sink for offline replay
+```
+
+### Dependency
+
+```toml
+# pyproject.toml
+[project.optional-dependencies]
+telemetry = ["zenoh>=1.0", "msgpack>=1.0"]
+```
+
+zenoh is optional. `TelemetryEmitter` gracefully degrades to no-op when zenoh is not installed. Observer process requires `physicalai[telemetry]`.
+
+### ActionQueue Change
+
+Add `peek_remaining()` — returns a copy of queued actions as `(R, D)` array without consuming them. Called once per inference event (not per tick) to snapshot the post-smooth future trajectory.
+
+```python
+def peek_remaining(self) -> np.ndarray | None:
+    """Return copy of remaining actions without consuming them. Thread-safe."""
+    with self._lock:
+        if not self._deque:
+            return None
+        return np.stack(list(self._deque))
+```
+
+### Integration Points
+
+**PolicyRuntime** (4 touch points in `runtime.py`):
+
+1. Accept optional `telemetry: TelemetryEmitter | None` in `__init__`
+2. `emit_lifecycle("start")` at top of `run()`
+3. `emit_tick(...)` at end of each tick
+4. `emit_lifecycle("shutdown")` in `_shutdown()`
+
+**AsyncExecution** (1 touch point in `execution.py`):
+
+1. Accept optional `telemetry: TelemetryEmitter | None` in `__init__`
+2. `emit_inference(...)` after `push_chunk()` in the `_run()` thread
+
+No changes to `smoothers.py` or `__init__.py` exports. `TelemetryEmitter` is internal wiring, not user-facing API.
+
+### Out of Scope
+
+- Remote telemetry (zenoh is network-transparent; enabling it is config, not code)
+- Camera frame streaming (stays on iceoryx2)
+- OpenTelemetry / OTLP exporter (future observer plugin)
+- Studio UI WebSocket bridge
+- Model input snapshot recording (expensive, needs opt-in design)
+- Smoothing delta visualization (derivable in observer from `inference.chunk` vs `tick.action_sent`)
+
+---
+
+## Phase 3.6: Fault Tolerance (1 day)
+
+Robot connections over USB/serial and camera feeds are fragile. USB hubs lose power, serial timeouts occur, cameras drop frames. The runtime must not crash on transient hardware errors — it must retry and recover, or degrade gracefully.
+
+### Current Problem
+
+`PolicyRuntime._build_model_input()` calls `robot.get_observation()` and `cam.read_latest()` with no error handling. `robot.send_action()` is also unprotected. Any `ConnectionError`, `OSError`, or `serial.SerialException` from the SO-101 serial bus kills the control loop.
+
+### Design Principles
+
+1. **Never crash the loop on a transient error.** Retry the operation. If retries are exhausted, hold position and log — do not raise.
+2. **Distinguish transient vs fatal.** USB disconnect that resolves after replug = transient. `ValueError` from wrong action shape = fatal (programmer error). `KeyboardInterrupt` = always propagate.
+3. **No silent degradation.** Every recovery must log a warning and emit a telemetry lifecycle event. The operator must know the robot hiccupped.
+4. **Bound retry duration.** Retries must not exceed the tick budget. If `get_observation()` fails 3 times within the tick, use the last known observation and move on.
+5. **Camera failure ≠ robot failure.** A dropped camera frame should use the last known frame (stale but safe). A robot communication failure is more serious but still retryable.
+
+### Implementation: `_resilient_observe()` and `_resilient_send()`
+
+Replace the raw calls in the control loop with retry-wrapped variants:
+
+```python
+_MAX_OBS_RETRIES = 3
+_MAX_SEND_RETRIES = 2
+_RETRY_BACKOFF_S = 0.001  # 1ms between retries — bounded, won't bust 33ms budget
+
+def _resilient_observe(self) -> dict[str, Any]:
+    """Build model input with retry on transient hardware errors."""
+    # Robot observation
+    robot_obs = None
+    for attempt in range(_MAX_OBS_RETRIES):
+        try:
+            robot_obs = self._robot.get_observation()
+            self._consecutive_error_ticks = 0
+            break
+        except (ConnectionError, OSError) as e:
+            logger.warning("Robot observation failed (attempt %d/%d): %s",
+                           attempt + 1, _MAX_OBS_RETRIES, e)
+            if self._telemetry:
+                self._telemetry.emit_lifecycle("obs_error", error=str(e),
+                                               attempt=attempt + 1)
+            if attempt < _MAX_OBS_RETRIES - 1:
+                time.sleep(_RETRY_BACKOFF_S)
+
+    if robot_obs is None:
+        self._consecutive_error_ticks += 1
+        if self._consecutive_error_ticks >= self._max_consecutive_error_ticks:
+            msg = (f"Robot unreachable for {self._consecutive_error_ticks} consecutive ticks "
+                   f"({self._consecutive_error_ticks / self._fps:.1f}s)")
+            if self._telemetry:
+                self._telemetry.emit_lifecycle("connection_lost", error=msg)
+            raise ConnectionError(msg)
+        if self._last_robot_obs is not None:
+            logger.warning("Using stale robot observation (error tick %d/%d)",
+                           self._consecutive_error_ticks,
+                           self._max_consecutive_error_ticks)
+            robot_obs = self._last_robot_obs
+            self._stale_obs_ticks += 1
+        else:
+            raise ConnectionError("Robot observation failed and no fallback available")
+    self._last_robot_obs = robot_obs
+
+    # Camera frames — independent per camera, each can fail independently
+    camera_frames: dict[str, Frame] = {}
+    for name, cam in self._cameras.items():
+        try:
+            camera_frames[name] = cam.read_latest()
+            self._last_camera_frames[name] = camera_frames[name]
+        except (ConnectionError, OSError) as e:
+            logger.warning("Camera '%s' read failed: %s — using last frame", name, e)
+            if name in self._last_camera_frames:
+                camera_frames[name] = self._last_camera_frames[name]
+                self._stale_obs_ticks += 1
+
+    return self._obs_to_input(robot_obs, camera_frames)
+
+
+def _resilient_send(self, action: np.ndarray) -> None:
+    """Send action with retry on transient errors."""
+    for attempt in range(_MAX_SEND_RETRIES):
+        try:
+            self._robot.send_action(action)
+            self._consecutive_error_ticks = 0
+            return
+        except (ConnectionError, OSError) as e:
+            logger.warning("send_action failed (attempt %d/%d): %s",
+                           attempt + 1, _MAX_SEND_RETRIES, e)
+            if self._telemetry:
+                self._telemetry.emit_lifecycle("send_error", error=str(e),
+                                               attempt=attempt + 1)
+            if attempt < _MAX_SEND_RETRIES - 1:
+                time.sleep(_RETRY_BACKOFF_S)
+    self._consecutive_error_ticks += 1
+    self._transient_errors += 1
+    logger.error("send_action failed after %d attempts — skipping tick",
+                 _MAX_SEND_RETRIES)
+```
+
+### Error Classification
+
+| Exception type               | Source                                                    | Treatment                         |
+| ---------------------------- | --------------------------------------------------------- | --------------------------------- |
+| `ConnectionError`            | SO-101 serial port closed, servo not responding           | Retry, then stale fallback        |
+| `OSError`                    | USB disconnect, file descriptor error, camera device lost | Retry, then stale fallback        |
+| `TimeoutError`               | Serial read timeout (subclass of `OSError`)               | Retry, then stale fallback        |
+| `ValueError`, `RuntimeError` | Wrong action shape, uncalibrated mode, programming error  | **Fatal** — propagate immediately |
+| `WorkerDiedError`            | Inference thread crash                                    | **Fatal** — propagate immediately |
+| `KeyboardInterrupt`          | User Ctrl+C                                               | **Always propagate**              |
+
+### State Tracking
+
+`PolicyRuntime` gains fields for stale-observation fallback and error escalation:
+
+```python
+self._last_robot_obs: RobotObservation | None = None
+self._last_camera_frames: dict[str, Frame] = {}
+self._consecutive_error_ticks: int = 0
+self._max_consecutive_error_ticks: int = int(3 * fps)  # ~3 seconds
+self._stale_obs_ticks: int = 0
+self._transient_errors: int = 0
+```
+
+`_consecutive_error_ticks` is reset to 0 on any successful `get_observation()` or `send_action()`. It increments when all retries within a tick fail. When it reaches `max_consecutive_error_ticks`, the runtime raises `ConnectionError`.
+
+`RunStats` gains fault tolerance metrics:
+
+```python
+@dataclass(frozen=True)
+class RunStats:
+    steps: int
+    total_pops: int
+    total_holds: int
+    inference_count: int
+    transient_errors: int    # total retried hardware errors
+    stale_obs_ticks: int     # ticks where stale observation was used
+```
+
+The telemetry `tick` event gains a `stale_obs: bool` field so the observer can flag degraded ticks.
+
+### Warmup Resilience
+
+Warmup is the most fragile moment — USB may not be fully enumerated, servos may still be initializing. The first call to `_build_model_input()` has no stale fallback.
+
+Warmup gets its own retry loop with longer timeout:
+
+```python
+_WARMUP_RETRIES = 5
+_WARMUP_BACKOFF_S = 1.0  # 1 second between warmup retries
+
+def _warmup_with_retry(self) -> None:
+    for attempt in range(_WARMUP_RETRIES):
+        try:
+            sample_obs = self._build_model_input()
+            self._execution.warmup(sample_obs)
+            return
+        except (ConnectionError, OSError) as e:
+            logger.warning("Warmup failed (attempt %d/%d): %s",
+                           attempt + 1, _WARMUP_RETRIES, e)
+            if attempt < _WARMUP_RETRIES - 1:
+                time.sleep(_WARMUP_BACKOFF_S)
+    msg = f"Warmup failed after {_WARMUP_RETRIES} attempts — robot or cameras unreachable"
+    raise ConnectionError(msg)
+```
+
+Total warmup retry budget: 5 seconds. Long enough for USB enumeration, short enough to not confuse the user.
+
+### Reconnection
+
+Retry within a tick handles brief glitches (serial timeout, dropped USB packet). For sustained disconnects (USB cable pulled), the robot's `is_connected()` returns `False` and consecutive retries will keep failing.
+
+Full reconnection (calling `robot.disconnect()` then `robot.connect()`) is **not** done automatically in the runtime. Reconnecting a serial bus mid-loop is dangerous — it resets servo state, torque settings, and calibration. This must be an explicit user decision.
+
+Instead, the runtime:
+
+1. Logs an error with reconnection instructions
+2. Continues holding position (stale obs + last action)
+3. Emits `lifecycle("connection_lost")` telemetry event
+4. After `max_consecutive_errors` (default: `3 * fps` = 3 seconds of failures), raises `ConnectionError` with a clear message
+
+This gives the user time to replug USB without the loop crashing, but doesn't silently run for minutes with a dead robot.
+
+### Camera Recovery
+
+Cameras are more resilient than robots — `SharedCamera` (iceoryx2) handles publisher death and re-spawn. Direct camera backends (`RealsenseCamera`, `UVCCamera`) may need explicit reconnection.
+
+The runtime treats camera failure as soft: use last frame, log warning, continue. If a camera has never produced a frame (failure on first read), omit it from the model input dict. The model's behavior with missing camera keys is model-specific — not the runtime's concern.
+
+### Tests
+
+```text
+physicalai/tests/unit/runtime/
+└── test_fault_tolerance.py   # Mock robot that raises on get_observation/send_action:
+                              #   - transient error → retry succeeds
+                              #   - sustained error → stale fallback
+                              #   - fatal error → propagates
+                              #   - camera failure → stale frame used
+                              #   - max_consecutive_errors → ConnectionError raised
+```
+
+---
+
+## Phase 4: Advanced (later)
+
+1. `AsyncExecution(transport="process")` for PyTorch CPU (GIL contention)
+2. RTC guidance correction (requires split export from `library/`)
+3. `RemoteExecution` + gRPC `PolicyServer` (see [policy_server_design.md](./policy_server_design.md))
+
+---
+
+## Component Mapping: inference_async.py → Library
+
+| Script component                            | Library target                                             | Notes                                                   |
+| ------------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------- |
+| `QueueMixer`                                | `ActionQueue` + `LerpSmoother`                             | Offset-aware blending extracted into smoother           |
+| `QueueMixer.lerp_duration = max(offset, 1)` | `LerpSmoother.merge()` computes from offset                | Stateless — `duration_frames` is fallback when offset=0 |
+| `InferenceThread`                           | `AsyncExecution`                                           | Same thread architecture: obs_slot + result push        |
+| `InferenceThread.force_reset()`             | `AsyncExecution._force_reset()`                            | Clears stuck state                                      |
+| `InferenceThread.busy_duration`             | `AsyncExecution._busy_duration`                            | Watchdog timeout trigger                                |
+| `InferenceThread.alive`                     | `AsyncExecution.alive`                                     | Dead thread detection                                   |
+| `get_full_observation()`                    | `default_observation_to_input()` + `obs_to_input` callable | Separates robot obs format from model input format      |
+| `action_to_robot_dict()`                    | `Robot.send_action(ndarray)`                               | Robot protocol handles conversion                       |
+| `main()` while-loop                         | `PolicyRuntime.run()`                                      | Same 5-step structure                                   |
+| Velocity clamping + ramp                    | User callback or example code                              | Too robot-specific for core runtime                     |
+| Camera discovery + `SharedCamera`           | User code / CLI (Phase 3)                                  | App-specific                                            |
+| Warm-up inference + queue seeding           | `Execution.warmup()`                                       | Seeds queue so loop starts with actions                 |
+| `inference_thread.alive` check + restart    | `AsyncExecution.maybe_request()` raises `WorkerDiedError`  | Raise instead of silent restart                         |
+| `force_reset()` for stuck thread            | `AsyncExecution._force_reset()` via watchdog               | Auto-triggered when `busy_duration > timeout`           |
+| Hold position + `hold_count`                | `ActionQueue.consecutive_holds` + `PolicyRuntime` logging  | Telemetry via queue counters                            |
+| Two-backend support (torch/OV)              | `InferenceModel` abstraction                               | Runtime only calls `predict_action_chunk()`             |
+| (none — ad-hoc print/log)                   | `TelemetryEmitter` + zenoh pub-sub                         | Fire-and-forget, no-op when zenoh absent (Phase 3.5)    |
+| (none — loop crashes on USB error)          | `_resilient_observe()` / `_resilient_send()`               | Retry + stale fallback for transient errors (Phase 3.6) |
+
+---
+
+## Design Gaps Addressed
+
+Gaps identified during architecture review, with resolutions.
+
+### Gap 1: Observation ownership in async
+
+**Problem**: Main thread passes observation dict to `maybe_request()`. If main thread reuses camera buffers, inference thread reads corrupted data.
+
+**Resolution**: `AsyncExecution.maybe_request()` performs defensive copy before submitting:
+
+```python
+snapshot = {
+    k: v.copy() if isinstance(v, np.ndarray) else v
+    for k, v in observation.items()
+}
+```
+
+Cost: one dict of numpy copies per inference request (not per tick — only when threshold triggers). At 30fps with threshold=0.5 and chunk_size=50, that's roughly once every ~0.8s.
+
+### Gap 2: Empty-queue telemetry
+
+**Problem**: "Hold last action" silently masks queue starvation.
+
+**Resolution**: `ActionQueue` tracks `consecutive_holds` and `total_holds`. `PolicyRuntime` logs warnings on first hold and every `fps` consecutive holds (1 per second). `on_hold` callback exposes starvation events to user code.
+
+### Gap 3: Graceful shutdown
+
+**Problem**: Hard stop can jerk the robot.
+
+**Resolution**: `PolicyRuntime._shutdown()` drains up to 1 second of remaining actions at loop FPS. Beyond 1s, hard stop — the user pressed Ctrl+C for a reason. Robot and cameras stay connected here; caller owns connect/disconnect lifecycle.
+
+### Gap 4: Error propagation
+
+**Problem**: Inference thread exceptions silently swallowed — `pop_action()` returns None forever.
+
+**Resolution**: `AsyncExecution` stores `_death_cause` on thread exception. `maybe_request()` checks `alive` and raises `WorkerDiedError` with original traceback preserved via `raise ... from`. PolicyRuntime catches and re-raises.
+
+### Gap 5: Two-backend support
+
+**Problem**: inference_async.py handles PyTorch and OpenVINO with different preprocessing.
+
+**Resolution**: Not a runtime concern. `InferenceModel` abstracts backend differences. Runtime only calls `model.predict_action_chunk(obs)`. The torch bypass in inference_async.py exists because the script bypasses `InferenceModel` for direct Pi05 policy access — the library runtime won't need this.
+
+---
+
+## Resolved Questions
+
+1. **`PolicyRuntime.run()` returns `RunStats`.** `@dataclass` with 4 core fields (steps, total_pops, total_holds, inference_count) plus 2 fault tolerance fields (transient_errors, stale_obs_ticks) added in Phase 3.6. Useful for testing and logging.
+
+2. **Warm-up happens inside `run()`.** User doesn't forget, no "did I call warmup?" failure mode.
+
+3. **Dead worker raises `WorkerDiedError`.** Let caller decide recovery strategy. Auto-restart can mask systematic failures.
+
+---
+
+## Relationship to Existing Design Docs
+
+This plan refines [policy_runtime_design.md](./policy_runtime_design.md). Key differences:
+
+| Topic                           | Original design doc                                              | This plan                                                                                                  |
+| ------------------------------- | ---------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| ActionQueue visibility          | Public parameter on PolicyRuntime                                | Public parameter with sensible default. Internal `_action_queue.py` module, exported via `__init__`.       |
+| Execution ABC                   | `start(action_queue, model)`, `maybe_request(obs, action_queue)` | `start(model, action_queue)`, `maybe_request(obs)` — queue stored internally on start, not passed per call |
+| Warmup                          | `warmup(sample_observation, n=2)`                                | `warmup(sample_observation)` — one call, seeds queue                                                       |
+| Health monitoring               | Not specified                                                    | First-class: alive, busy, busy_duration, WorkerDiedError, watchdog                                         |
+| Smoothing                       | `LerpChunkSmoother`, `ReplaceMerger`                             | `LerpSmoother`, `ReplaceSmoother` — stateless merge, offset-aware                                          |
+| Observation bridge              | Not specified                                                    | `obs_to_input` callable with `default_observation_to_input` fallback                                       |
+| `predict_action_chunk()` return | `Mapping[str, Any]` with `"actions"` key                         | `np.ndarray` directly (matches actual implementation)                                                      |
+| Bug fixes                       | Not applicable                                                   | Phase 1 prerequisite — bugs 1-3 on critical path                                                           |
+| Telemetry                       | Not specified                                                    | Phase 3.5: zenoh pub-sub emitter (fire-and-forget), separate observer process, optional dep                |
+| Fault tolerance                 | Not specified                                                    | Phase 3.6: retry + stale fallback for transient HW errors, error classification, bounded retry             |
+
+All ownership rules, boundary constraints, and deferred-until-needed decisions from the original design doc remain in effect.
diff --git a/examples/runtime/inference_async.py b/examples/runtime/inference_async.py
new file mode 100644
index 0000000..9eade0a
--- /dev/null
+++ b/examples/runtime/inference_async.py
@@ -0,0 +1,764 @@
+# pyrefly: ignore-errors
+# Copyright (C) 2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Async threaded inference for SO-101 with lerp-blended action chunking.
+
+Supports two backends:
+- **PyTorch/CUDA**: Pi05 checkpoint (.ckpt file) — GPU inference via select_action.
+- **OpenVINO**: Exported model directory — CPU/GPU inference via InferenceModel.
+
+The backend is auto-detected from the checkpoint path:
+- File ending in .ckpt → PyTorch Pi05
+- Directory → OpenVINO InferenceModel
+
+The main thread runs the robot control loop at a fixed FPS.
+A background thread runs model inference and pushes action chunks
+into a shared queue with lerp blending for smooth transitions.
+
+Uses LeRobot SO101Follower (degrees) for robot control and
+physicalai SharedCamera (iceoryx2) for camera input.
+
+Usage (PyTorch):
+    python inference_async_pi05.py \
+        --model-path pi05/pi05_eugene.ckpt \
+        --device cuda \
+        --robot-port /dev/ttyACM1 \
+        --task "pick a can and place it in a bowl"
+
+Usage (OpenVINO):
+    python inference_async_pi05.py \
+        --model-path exports/pi05_ov \
+        --device CPU \
+        --robot-port /dev/ttyACM1 \
+        --camera-map overhead=top arm=wrist \
+        --task "pick a can and place it in a bowl"
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import threading
+import time
+from pathlib import Path
+
+import cv2
+import numpy as np
+
+from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig
+from physicalai.capture import SharedCamera
+from physicalai.capture.discovery import DeviceInfo, discover_all
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s [%(threadName)s] %(name)s: %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+JOINT_NAMES = [
+    "shoulder_pan",
+    "shoulder_lift",
+    "elbow_flex",
+    "wrist_flex",
+    "wrist_roll",
+    "gripper",
+]
+
+IMAGE_WIDTH = 640
+IMAGE_HEIGHT = 480
+CAMERA_FPS = 30
+
+
+# ---------------------------------------------------------------------------
+# Queue Mixer — lerp-blends old and new action chunks
+# ---------------------------------------------------------------------------
+
+class QueueMixer:
+    """Action queue with linear-interpolation blending between chunks.
+
+    When a new chunk arrives while old actions remain, the overlapping
+    region is lerp-blended so the robot doesn't jerk.  An *offset*
+    parameter lets us skip the first N actions of the new chunk to
+    compensate for inference latency.
+    """
+
+    def __init__(self, lerp_duration: int = 5) -> None:
+        self.queue: np.ndarray | None = None
+        self.lerp_duration = lerp_duration
+        self.index = 0
+
+    def add(self, chunk: np.ndarray, offset: int = 0) -> None:
+        """Merge *chunk* (H, action_dim) into the queue."""
+        if self.queue is None or self.index >= len(self.queue):
+            self.queue = chunk[offset:]
+            self.index = 0
+            logger.info(
+                "[QueueMixer] First/exhausted → replaced queue (len=%d, offset=%d)",
+                len(self.queue), offset,
+            )
+            return
+
+        remaining = self.queue[self.index:]
+        incoming = chunk[offset:]
+        n_remain = len(remaining)
+        lerp_dur = min(n_remain, self.lerp_duration)
+
+        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)
+        weights = weights[:, np.newaxis]
+
+        n_blend = min(n_remain, len(incoming))
+        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]
+
+        self.queue = np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)
+        self.index = 0
+        logger.info(
+            "[QueueMixer] Blended chunk (blended=%d, appended=%d, total=%d, lerp=%d, offset=%d)",
+            n_blend, max(len(incoming) - n_blend, 0), len(self.queue), lerp_dur, offset,
+        )
+
+    def pop(self) -> np.ndarray | None:
+        """Pop the next action, or return None if empty."""
+        if self.queue is None or self.index >= len(self.queue):
+            return None
+        action = self.queue[self.index]
+        self.index += 1
+        return action
+
+    @property
+    def remaining(self) -> int:
+        if self.queue is None:
+            return 0
+        return max(len(self.queue) - self.index, 0)
+
+    @property
+    def empty(self) -> bool:
+        return self.remaining == 0
+
+
+# ---------------------------------------------------------------------------
+# Inference Thread (PyTorch / CUDA)
+# ---------------------------------------------------------------------------
+
+class InferenceThread:
+    """Background thread that runs model inference on demand.
+
+    Supports two backends:
+    - "torch": Pi05 GPU inference via select_action + action_queue draining.
+    - "openvino": InferenceModel inference via model(dict)["action"].
+
+    The control loop writes an observation dict into `obs_slot` and sets
+    `obs_ready`.  The inference thread picks it up, runs inference, and
+    pushes the result (chunk + timing) into `result_slot`.
+    """
+
+    def __init__(
+        self,
+        model: object,
+        backend: str,
+        device: str,
+        task: str,
+        flip_cameras: set[str],
+        camera_name_map: dict[str, str] | None = None,
+        blank_cameras: list[str] | None = None,
+    ) -> None:
+        self.model = model
+        self.backend = backend
+        self.device = device
+        self.task = task
+        self.flip_cameras = flip_cameras
+        self.camera_name_map = camera_name_map or {}
+        self.blank_cameras = blank_cameras or []
+
+        self._lock = threading.Lock()
+        self._obs_slot: dict | None = None
+        self._obs_ready = threading.Event()
+        self._running_inference = False
+        self._request_time = 0.0
+
+        self._result_lock = threading.Lock()
+        self._result_slot: tuple[np.ndarray, float] | None = None
+
+        self._stop = threading.Event()
+        self._thread = threading.Thread(target=self._run, name="InferenceThread", daemon=True)
+        self.inference_count = 0
+
+    def start(self) -> None:
+        logger.info("[InferenceThread] Starting background thread")
+        self._thread.start()
+
+    def stop(self) -> None:
+        logger.info("[InferenceThread] Requesting stop")
+        self._stop.set()
+        self._obs_ready.set()
+        self._thread.join(timeout=10.0)
+        logger.info("[InferenceThread] Stopped (ran %d inferences)", self.inference_count)
+
+    def request(self, observation: dict) -> bool:
+        """Submit an observation dict for inference. Returns False if busy."""
+        with self._lock:
+            if self._obs_slot is not None or self._running_inference:
+                return False
+            self._obs_slot = observation
+            self._request_time = time.perf_counter()
+        self._obs_ready.set()
+        logger.debug("[InferenceThread] Observation submitted")
+        return True
+
+    @property
+    def busy(self) -> bool:
+        with self._lock:
+            return self._obs_slot is not None or self._running_inference
+
+    @property
+    def alive(self) -> bool:
+        return self._thread.is_alive()
+
+    @property
+    def busy_duration(self) -> float:
+        """Seconds since last request was submitted. 0 if not busy."""
+        with self._lock:
+            if not (self._obs_slot is not None or self._running_inference):
+                return 0.0
+            return time.perf_counter() - self._request_time
+
+    def force_reset(self) -> None:
+        """Clear stuck state so new requests can be submitted."""
+        with self._lock:
+            self._obs_slot = None
+            self._running_inference = False
+        logger.warning("[InferenceThread] Force reset — cleared stuck state")
+
+    def get_result(self) -> tuple[np.ndarray, float] | None:
+        """Non-blocking fetch of the latest result. Returns (chunk, latency_s) or None."""
+        with self._result_lock:
+            r = self._result_slot
+            self._result_slot = None
+        return r
+
+    def _build_torch_observation(self, obs_dict: dict) -> object:
+        """Convert raw observation dict to Pi05 Observation on device (torch)."""
+        import torch
+        from physicalai.data.observation import Observation
+
+        state = torch.tensor(
+            [[obs_dict[f"{jn}.pos"] for jn in JOINT_NAMES]],
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        images = {}
+        for cam_key in [k for k in obs_dict if isinstance(obs_dict[k], np.ndarray) and obs_dict[k].ndim == 3]:
+            img = np.ascontiguousarray(obs_dict[cam_key])
+            if cam_key in self.flip_cameras:
+                img = cv2.rotate(img, cv2.ROTATE_180)
+            t = torch.from_numpy(img.copy()).float() / 255.0
+            t = t.permute(2, 0, 1).unsqueeze(0)
+            images[cam_key] = t.to(self.device)
+
+        return Observation(state=state, images=images, task=self.task)
+
+    def _build_ov_observation(self, obs_dict: dict) -> dict[str, np.ndarray]:
+        """Convert raw observation dict to InferenceModel input (numpy)."""
+        state = np.array([[obs_dict[f"{jn}.pos"] for jn in JOINT_NAMES]], dtype=np.float32)
+        observation: dict[str, np.ndarray] = {"state": state, "task": [self.task]}
+
+        for cam_key in [k for k in obs_dict if isinstance(obs_dict[k], np.ndarray) and obs_dict[k].ndim == 3]:
+            img = np.ascontiguousarray(obs_dict[cam_key])
+            if cam_key in self.flip_cameras:
+                img = cv2.rotate(img, cv2.ROTATE_180)
+            img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))
+            img = img.transpose(2, 0, 1).astype(np.float32)[np.newaxis] / 255.0
+            model_name = self.camera_name_map.get(cam_key, cam_key)
+            observation[f"images.{model_name}"] = img
+
+        for blank_cam in self.blank_cameras:
+            observation[f"images.{blank_cam}"] = np.zeros(
+                (1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), dtype=np.float32,
+            )
+
+        return observation
+
+    def _infer_torch(self, obs_dict: dict) -> np.ndarray:
+        """Run Pi05 inference and return the full action chunk as numpy."""
+        import torch
+        from collections import deque
+
+        observation = self._build_torch_observation(obs_dict)
+        self.model._action_queue = deque()
+
+        with torch.no_grad():
+            action = self.model.select_action(observation)
+        if self.device == "cuda":
+            torch.cuda.synchronize()
+
+        actions = [action.detach().cpu().numpy()]
+        while len(self.model._action_queue) > 0:
+            a = self.model._action_queue.popleft()
+            actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)
+
+        return np.array([np.squeeze(a) for a in actions], dtype=np.float32)
+
+    def _infer_ov(self, obs_dict: dict) -> np.ndarray:
+        """Run OpenVINO inference and return the full action chunk as numpy."""
+        model_input = self._build_ov_observation(obs_dict)
+        output = self.model(model_input)["action"]
+        chunk = np.squeeze(output)
+        if chunk.ndim == 1:
+            chunk = chunk[np.newaxis, :]
+        return chunk.astype(np.float32)
+
+    def _run(self) -> None:
+        logger.info("[InferenceThread] Thread started, waiting for observations")
+        while not self._stop.is_set():
+            self._obs_ready.wait()
+            self._obs_ready.clear()
+
+            if self._stop.is_set():
+                break
+
+            with self._lock:
+                obs_dict = self._obs_slot
+                self._obs_slot = None
+                self._running_inference = True
+
+            if obs_dict is None:
+                with self._lock:
+                    self._running_inference = False
+                continue
+
+            self.inference_count += 1
+            logger.info("[InferenceThread] >>> Inference #%d START", self.inference_count)
+            t0 = time.perf_counter()
+
+            try:
+                if self.backend == "torch":
+                    chunk = self._infer_torch(obs_dict)
+                else:
+                    chunk = self._infer_ov(obs_dict)
+            except Exception:
+                logger.exception("[InferenceThread] Inference failed")
+                with self._lock:
+                    self._running_inference = False
+                continue
+
+            latency = time.perf_counter() - t0
+            logger.info(
+                "[InferenceThread] <<< Inference #%d DONE  latency=%.1fms  chunk_shape=%s",
+                self.inference_count, latency * 1000, chunk.shape,
+            )
+
+            with self._result_lock:
+                self._result_slot = (chunk, latency)
+            with self._lock:
+                self._running_inference = False
+
+
+# ---------------------------------------------------------------------------
+# Observation helpers
+# ---------------------------------------------------------------------------
+
+def get_full_observation(
+    robot: SO101Follower,
+    cameras: dict[str, SharedCamera],
+) -> dict:
+    """Combine robot state + SharedCamera images into one dict.
+
+    LeRobot SO101Follower returns joint values in the correct
+    model domain (body joints in degrees, gripper in [0, 100]).
+    """
+    obs = robot.get_observation()
+    for cam_key, cam in cameras.items():
+        frame = cam.read_latest()
+        obs[cam_key] = frame.data
+    return obs
+
+
+def action_to_robot_dict(action: np.ndarray) -> dict[str, float]:
+    """Convert action array to a LeRobot send_action dict."""
+    return {f"{jn}.pos": float(action[i]) for i, jn in enumerate(JOINT_NAMES)}
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+# ---------------------------------------------------------------------------
+# Camera discovery
+# ---------------------------------------------------------------------------
+
+def select_cameras() -> dict[str, str]:
+    """Discover cameras and interactively ask user to assign overhead/arm."""
+    print("\n[cameras] Discovering cameras ...", flush=True)
+
+    all_devices = discover_all()
+
+    # Flatten into a single list (deduplicate across drivers by hardware_id)
+    seen: set[str] = set()
+    devices: list[DeviceInfo] = []
+    for driver_devices in all_devices.values():
+        for dev in driver_devices:
+            key = dev.hardware_id or f"{dev.driver}:{dev.device_id}"
+            if key not in seen:
+                seen.add(key)
+                devices.append(dev)
+
+    if not devices:
+        raise RuntimeError("No cameras found. Check connections and permissions.")
+
+    print(f"\n  Found {len(devices)} camera(s):", flush=True)
+    for i, dev in enumerate(devices):
+        stable_id = dev.hardware_id or ""
+        print(f"    [{i}] {dev.name or dev.model or 'Unknown'}  ({stable_id or dev.device_id})", flush=True)
+
+    cameras: dict[str, str] = {}
+    for role in ("overhead", "arm"):
+        while True:
+            choice = input(f"\n  Select {role} camera (0-{len(devices)-1}), or 's' to skip: ").strip()
+            if choice.lower() == "s":
+                print(f"    Skipping {role} camera", flush=True)
+                break
+            if choice.isdigit() and 0 <= int(choice) < len(devices):
+                dev = devices[int(choice)]
+                # Prefer stable by-id path, fall back to device_id
+                path = dev.hardware_id if dev.hardware_id else dev.device_id
+                cameras[role] = path
+                print(f"    {role} → {path}", flush=True)
+                break
+            print(f"    Invalid choice. Enter 0-{len(devices)-1} or 's'.")
+
+    if not cameras:
+        raise RuntimeError("At least one camera must be selected.")
+
+    return cameras
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="Async threaded SO-101 inference (PyTorch or OpenVINO).")
+    p.add_argument("--model-path", type=str, required=True,
+                    help="Path to Pi05 checkpoint (.ckpt) or OpenVINO export directory")
+    p.add_argument("--device", type=str, default="cuda",
+                    help="'cuda' or 'cpu' for torch; 'CPU', 'GPU', 'NPU' for OpenVINO")
+    p.add_argument("--robot-port", type=str, default="/dev/ttyACM1")
+    p.add_argument("--robot-id", type=str, default="my_so101_follower")
+    p.add_argument("--overhead-camera", type=str, default=None,
+                    help="UVC device path for overhead camera (skip interactive selection)")
+    p.add_argument("--arm-camera", type=str, default=None,
+                    help="UVC device path for arm camera (skip interactive selection)")
+    p.add_argument("--task", type=str, default="pick a can and place it in a bowl")
+    p.add_argument("--fps", type=int, default=30)
+    p.add_argument("--queue-threshold", type=float, default=0.5,
+                    help="Request new inference when queue drops below this fraction of chunk size.")
+    p.add_argument("--lerp-duration", type=int, default=5,
+                    help="Number of frames over which to blend old/new chunks.")
+    p.add_argument("--flip-cameras", nargs="*", default=[],
+                    help="Camera names to rotate 180 degrees.")
+    p.add_argument("--infer-timeout", type=float, default=15.0,
+                    help="Seconds before force-resetting a stuck inference thread.")
+    p.add_argument("--max-speed", type=float, default=90.0,
+                    help="Max joint speed in degrees/second. Clamps action deltas per step.")
+    p.add_argument("--ramp-steps", type=int, default=30,
+                    help="Number of initial steps to ramp up speed (starts at 1/4 max-speed).")
+    p.add_argument("--no-compile", action="store_true", default=True,
+                    help="Disable torch.compile (default: disabled, torch only)")
+    # OpenVINO-specific options
+    p.add_argument("--camera-map", nargs="*",
+                    help="Map camera names to model input names, e.g. overhead=top arm=wrist (OV only)")
+    p.add_argument("--blank-cameras", nargs="*",
+                    help="Model camera inputs to fill with zeros, e.g. side (OV only)")
+    return p.parse_args()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main(args: argparse.Namespace) -> None:
+    model_path = Path(args.model_path)
+    if not model_path.exists():
+        raise FileNotFoundError(f"Model not found: {model_path}")
+
+    # Auto-detect backend
+    if model_path.is_dir():
+        backend = "openvino"
+        # Default device for OpenVINO is CPU when user left the torch default
+        if args.device in ("cuda", "cuda:0"):
+            print(f"[init] Warning: OpenVINO backend does not support CUDA. Falling back to CPU.", flush=True)
+            args.device = "CPU"
+        else:
+            args.device = args.device.upper()
+    elif model_path.suffix == ".ckpt":
+        backend = "torch"
+    else:
+        raise ValueError(
+            f"Cannot determine backend from '{model_path}'. "
+            "Use a .ckpt file for PyTorch or a directory for OpenVINO."
+        )
+
+    flip_cameras = set(args.flip_cameras or [])
+    camera_name_map = dict(kv.split("=") for kv in (args.camera_map or []))
+    blank_cameras = list(args.blank_cameras or [])
+
+    print(f"[init] Model: {model_path}", flush=True)
+    print(f"[init] Backend: {backend}", flush=True)
+    print(f"[init] Device: {args.device}", flush=True)
+    print(f"[init] Flip cameras: {flip_cameras}", flush=True)
+    if backend == "openvino":
+        print(f"[init] Camera map: {camera_name_map}", flush=True)
+        print(f"[init] Blank cameras: {blank_cameras}", flush=True)
+
+    # --- Set up cameras ---
+    # Use CLI paths if provided, otherwise run interactive selection
+    if args.overhead_camera or args.arm_camera:
+        camera_paths: dict[str, str] = {}
+        if args.overhead_camera:
+            camera_paths["overhead"] = args.overhead_camera
+        if args.arm_camera:
+            camera_paths["arm"] = args.arm_camera
+    else:
+        camera_paths = select_cameras()
+
+    cameras: dict[str, SharedCamera] = {}
+    for cam_name, cam_device in camera_paths.items():
+        cameras[cam_name] = SharedCamera(
+            "uvc", device=cam_device,
+            width=IMAGE_WIDTH, height=IMAGE_HEIGHT, fps=CAMERA_FPS,
+        )
+
+    for name, cam in cameras.items():
+        cam.connect()
+        print(f"[init] Camera '{name}' connected: {cam.actual_width}x{cam.actual_height} @ {cam.actual_fps}fps", flush=True)
+
+    # --- Load model ---
+    if backend == "torch":
+        import torch
+        from physicalai.policies.pi05 import Pi05
+
+        print("[init] Loading Pi05 model (map_location=cpu) ...", flush=True)
+        model = Pi05.load_from_checkpoint(
+            str(model_path), map_location="cpu", compile_model=not args.no_compile,
+        )
+        print("[init] Checkpoint loaded to CPU, moving to device ...", flush=True)
+        model = model.to(args.device)
+        model.eval()
+        print(f"[init] Model loaded on {args.device}", flush=True)
+        if args.device == "cuda":
+            print(f"[init] VRAM used: {torch.cuda.memory_allocated() / 1e9:.2f} GB", flush=True)
+    else:
+        import openvino_tokenizers  # noqa: F401
+        from physicalai.inference import InferenceModel
+        from physicalai.inference.runners import SinglePass
+
+        print(f"[init] Loading OpenVINO model from {model_path} ...", flush=True)
+        model = InferenceModel(
+            export_dir=str(model_path),
+            device=args.device,
+            runner=SinglePass(),
+        )
+        print("[init] Model loaded", flush=True)
+
+
+    # --- Connect robot (LeRobot SO101Follower, no cameras) ---
+    print(f"[init] Connecting robot on {args.robot_port} ...", flush=True)
+    robot_cfg = SO101FollowerConfig(
+        port=args.robot_port,
+        id=args.robot_id,
+    )
+    robot = SO101Follower(robot_cfg)
+    robot.connect()
+    print(f"[init] Robot connected (id={args.robot_id})", flush=True)
+
+    # --- Warm-up inference to determine chunk size ---
+    print("[init] Getting warm-up observation ...", flush=True)
+    warmup_obs = get_full_observation(robot, cameras)
+    print(f"[init] Got observation, keys: {list(warmup_obs.keys())}", flush=True)
+
+    print("[init] Running warm-up inference ...", flush=True)
+    t0 = time.perf_counter()
+
+    if backend == "torch":
+        import torch
+        from collections import deque
+        from physicalai.data.observation import Observation
+
+        state = torch.tensor(
+            [[warmup_obs[f"{jn}.pos"] for jn in JOINT_NAMES]],
+            dtype=torch.float32, device=args.device,
+        )
+        warmup_images = {}
+        for cam_key in cameras:
+            img = np.ascontiguousarray(warmup_obs[cam_key])
+            if cam_key in flip_cameras:
+                img = cv2.rotate(img, cv2.ROTATE_180)
+            t = torch.from_numpy(img.copy()).float() / 255.0
+            t = t.permute(2, 0, 1).unsqueeze(0).to(args.device)
+            warmup_images[cam_key] = t
+
+        warmup_observation = Observation(state=state, images=warmup_images, task=args.task)
+        model._action_queue = deque()
+        with torch.no_grad():
+            warmup_action = model.select_action(warmup_observation)
+        if args.device == "cuda":
+            torch.cuda.synchronize()
+
+        warmup_actions = [warmup_action.detach().cpu().numpy()]
+        while len(model._action_queue) > 0:
+            a = model._action_queue.popleft()
+            warmup_actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)
+        warmup_chunk = np.array([np.squeeze(a) for a in warmup_actions], dtype=np.float32)
+    else:
+        # OV warm-up
+        warmup_state = np.array(
+            [[warmup_obs[f"{jn}.pos"] for jn in JOINT_NAMES]], dtype=np.float32,
+        )
+        warmup_input: dict[str, np.ndarray] = {"state": warmup_state, "task": [args.task]}
+        for cam_key in cameras:
+            img = np.ascontiguousarray(warmup_obs[cam_key])
+            if cam_key in flip_cameras:
+                img = cv2.rotate(img, cv2.ROTATE_180)
+            img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))
+            img = img.transpose(2, 0, 1).astype(np.float32)[np.newaxis] / 255.0
+            model_name = camera_name_map.get(cam_key, cam_key)
+            warmup_input[f"images.{model_name}"] = img
+        for blank_cam in blank_cameras:
+            warmup_input[f"images.{blank_cam}"] = np.zeros(
+                (1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), dtype=np.float32,
+            )
+        warmup_out = model(warmup_input)["action"]
+        warmup_chunk = np.squeeze(warmup_out)
+        if warmup_chunk.ndim == 1:
+            warmup_chunk = warmup_chunk[np.newaxis, :]
+        warmup_chunk = warmup_chunk.astype(np.float32)
+
+    warmup_latency = time.perf_counter() - t0
+    chunk_size = warmup_chunk.shape[0]
+    action_dim = warmup_chunk.shape[1]
+    print(f"[init] Warm-up done: chunk_size={chunk_size}, action_dim={action_dim}, "
+          f"latency={warmup_latency*1000:.1f}ms", flush=True)
+
+    # --- Init components ---
+    queue_mixer = QueueMixer(lerp_duration=args.lerp_duration)
+    queue_mixer.add(warmup_chunk, offset=0)
+
+    inference_thread = InferenceThread(
+        model, backend, args.device, args.task, flip_cameras,
+        camera_name_map=camera_name_map, blank_cameras=blank_cameras,
+    )
+    inference_thread.start()
+
+    goal_time = 1.0 / args.fps
+    threshold = int(chunk_size * args.queue_threshold)
+    max_delta_per_step = args.max_speed / args.fps  # degrees per step
+    step = 0
+    actions_from_queue = 0
+    hold_count = 0
+    last_action: np.ndarray = warmup_chunk[0]
+
+    # Initialize commanded position from current robot state (degrees/[0,100])
+    init_obs = robot.get_observation()
+    commanded_pos = np.array(
+        [init_obs[f"{jn}.pos"] for jn in JOINT_NAMES], dtype=np.float32,
+    )
+    print(f"[init] Current joint pos: {commanded_pos}", flush=True)
+    print(f"[init] Max speed: {args.max_speed:.0f} deg/s → {max_delta_per_step:.1f} per step @ {args.fps}Hz", flush=True)
+    print(f"[init] Ramp-up: {args.ramp_steps} steps", flush=True)
+    print("[run] Starting — Ctrl+C to stop", flush=True)
+
+    try:
+        while True:
+            loop_start = time.perf_counter()
+
+            # 1. Check for inference results
+            result = inference_thread.get_result()
+            if result is not None:
+                chunk, latency = result
+                offset = int(latency * args.fps)
+                logger.info(
+                    "[Control] Got inference #%d: chunk=%s, latency=%.1fms, offset=%d",
+                    inference_thread.inference_count, chunk.shape, latency * 1000, offset,
+                )
+                queue_mixer.add(chunk, offset=offset)
+                queue_mixer.lerp_duration = max(offset, 1)
+
+            # 2. Maybe request new inference (with stuck/dead detection)
+            if queue_mixer.remaining <= threshold:
+                if not inference_thread.alive:
+                    logger.error("[Control] Inference thread DEAD — restarting")
+                    inference_thread = InferenceThread(
+                        model, backend, args.device, args.task, flip_cameras,
+                        camera_name_map=camera_name_map, blank_cameras=blank_cameras,
+                    )
+                    inference_thread.start()
+                elif inference_thread.busy_duration > args.infer_timeout:
+                    logger.warning(
+                        "[Control] Inference stuck for %.0fs — force resetting",
+                        inference_thread.busy_duration,
+                    )
+                    inference_thread.force_reset()
+
+                if not inference_thread.busy:
+                    obs = get_full_observation(robot, cameras)
+                    submitted = inference_thread.request(obs)
+                    if submitted:
+                        logger.info(
+                            "[Control] Requested inference (queue_remaining=%d, threshold=%d)",
+                            queue_mixer.remaining, threshold,
+                        )
+
+            # 3. Pop action from queue
+            action = queue_mixer.pop()
+            if action is not None:
+                last_action = action
+                actions_from_queue += 1
+                hold_count = 0
+            else:
+                action = last_action
+                hold_count += 1
+                if hold_count == 1 or hold_count % 30 == 0:
+                    logger.warning(
+                        "[Control] Queue empty, holding position (hold_count=%d)", hold_count,
+                    )
+
+            # 4. Velocity clamp and send
+            if step < args.ramp_steps:
+                ramp_frac = 0.25 + 0.75 * (step / args.ramp_steps)
+                effective_delta = max_delta_per_step * ramp_frac
+            else:
+                effective_delta = max_delta_per_step
+
+            delta = action - commanded_pos
+            clamped_delta = np.clip(delta, -effective_delta, effective_delta)
+            commanded_pos = (commanded_pos + clamped_delta).astype(np.float32)
+            robot.send_action(action_to_robot_dict(commanded_pos))
+
+            # 5. Timing
+            elapsed = time.perf_counter() - loop_start
+            sleep_time = goal_time - elapsed
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+            step += 1
+            if step % 100 == 0:
+                actual_hz = 1.0 / max(time.perf_counter() - loop_start, 1e-6)
+                logger.info(
+                    "[Control] Step %d | %.1f Hz | queue=%d | from_queue=%d | holds=%d | inferences=%d",
+                    step, actual_hz, queue_mixer.remaining,
+                    actions_from_queue, hold_count, inference_thread.inference_count,
+                )
+
+    except KeyboardInterrupt:
+        logger.info("[Control] Interrupted by user")
+    finally:
+        inference_thread.stop()
+        for name, cam in cameras.items():
+            cam.disconnect()
+            logger.info("Camera '%s' disconnected", name)
+        robot.disconnect()
+        logger.info(
+            "[Control] Cleanup complete — %d steps, %d actions from queue, %d holds, %d inferences",
+            step, actions_from_queue, hold_count, inference_thread.inference_count,
+        )
+
+
+if __name__ == "__main__":
+    main(parse_args())
diff --git a/examples/runtime/inference_pi05_gpu.ipynb b/examples/runtime/inference_pi05_gpu.ipynb
new file mode 100644
index 0000000..fcd656c
--- /dev/null
+++ b/examples/runtime/inference_pi05_gpu.ipynb
@@ -0,0 +1,838 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "68d10b89",
+   "metadata": {},
+   "source": [
+    "# Pi05 GPU Inference (PyTorch + CUDA)\n",
+    "\n",
+    "Loads Eugene's finetuned Pi05 checkpoint and runs inference on RTX 3090.\n",
+    "Expected: ~1-5s per action chunk (vs ~90s on CPU with OpenVINO)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf8f6bdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "import cv2\n",
+    "from collections import deque\n",
+    "from matplotlib import pyplot as plt\n",
+    "\n",
+    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
+    "if torch.cuda.is_available():\n",
+    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
+    "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98f8f8c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# --- Config ---\n",
+    "CHECKPOINT_PATH = \"pi05/pi05_eugene.ckpt\"\n",
+    "DEVICE = \"cuda\"  # \"cuda\" or \"cpu\"\n",
+    "TASK = \"pick a can and place it in a bowl\"\n",
+    "\n",
+    "JOINT_NAMES = [\n",
+    "    \"shoulder_pan\",\n",
+    "    \"shoulder_lift\",\n",
+    "    \"elbow_flex\",\n",
+    "    \"wrist_flex\",\n",
+    "    \"wrist_roll\",\n",
+    "    \"gripper\",\n",
+    "]\n",
+    "\n",
+    "# Camera config (SharedCamera UVC device indices)\n",
+    "OVERHEAD_CAMERA_INDEX = \"/dev/v4l/by-id/usb-UGREEN_Camera_2K_UGREEN_Camera_2K_SN0001-video-index0\"\n",
+    "ARM_CAMERA_INDEX = \"/dev/v4l/by-id/usb-Innomaker_Innomaker-U20CAM-1080p-S1_SN0001-video-index0\"\n",
+    "FLIP_CAMERAS = {\"arm\"}  # Cameras mounted upside down\n",
+    "\n",
+    "CAMERA_WIDTH = 640\n",
+    "CAMERA_FPS = 30\n",
+    "CAMERA_HEIGHT = 480"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f32365d",
+   "metadata": {},
+   "source": [
+    "## Load Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9dce2df9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from physicalai.policies.pi05 import Pi05\n",
+    "\n",
+    "# Load to CPU first to avoid OOM if GPU has residual allocations\n",
+    "policy = Pi05.load_from_checkpoint(CHECKPOINT_PATH, map_location=\"cpu\", compile_model=False)\n",
+    "policy = policy.to(DEVICE)\n",
+    "policy.eval()\n",
+    "print(f\"Model loaded on {DEVICE}\")\n",
+    "print(f\"VRAM used: {torch.cuda.memory_allocated() / 1e9:.2f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad387d02",
+   "metadata": {},
+   "source": [
+    "## Connect Cameras & Robot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e022b604",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cv2_enumerate_cameras import enumerate_cameras\n",
+    "\n",
+    "print(\"Available cameras:\")\n",
+    "for c in enumerate_cameras(apiPreference=cv2.CAP_GSTREAMER):\n",
+    "    print(f\" - {c.name} (index {c.index})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f8c0a963",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from physicalai.capture import SharedCamera\n",
+    "from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig\n",
+    "\n",
+    "# Cameras via SharedCamera (UVC)\n",
+    "cameras = {\n",
+    "    \"overhead\": SharedCamera(\"uvc\", device=OVERHEAD_CAMERA_INDEX, width=CAMERA_WIDTH, height=CAMERA_HEIGHT, fps=CAMERA_FPS),\n",
+    "    \"arm\": SharedCamera(\"uvc\", device=ARM_CAMERA_INDEX, width=CAMERA_WIDTH, height=CAMERA_HEIGHT, fps=CAMERA_FPS),\n",
+    "}\n",
+    "for name, cam in cameras.items():\n",
+    "    cam.connect()\n",
+    "    print(f\"Camera '{name}' connected: {cam.actual_width}x{cam.actual_height} @ {cam.actual_fps}fps\")\n",
+    "\n",
+    "# Robot without cameras\n",
+    "robot_cfg = SO101FollowerConfig(\n",
+    "    port=\"/dev/ttyACM1\",\n",
+    "    id=\"my_so101_follower\",\n",
+    "\n",
+    ")\n",
+    "\n",
+    "robot = SO101Follower(robot_cfg)\n",
+    "robot.connect()\n",
+    "print(\"Robot connected\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "049e73d1",
+   "metadata": {},
+   "source": [
+    "## Get Observation & Display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1630af6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_full_observation() -> dict:\n",
+    "    \"\"\"Combine robot state + SharedCamera images into one dict.\"\"\"\n",
+    "    obs = robot.get_observation()\n",
+    "    for cam_key, cam in cameras.items():\n",
+    "        frame = cam.read_latest()\n",
+    "        obs[cam_key] = frame.data  # numpy HWC RGB\n",
+    "    return obs\n",
+    "\n",
+    "def viz_observation(obs: dict):\n",
+    "    print(\"Joint positions:\")\n",
+    "    for jn in JOINT_NAMES:\n",
+    "        print(f\"  {jn}: {obs[f'{jn}.pos']:.4f}\")\n",
+    "\n",
+    "    fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
+    "    for ax, cam_key in zip(axes, [\"overhead\", \"arm\"]):\n",
+    "        img = np.ascontiguousarray(obs[cam_key])\n",
+    "        if cam_key in FLIP_CAMERAS:\n",
+    "            img = cv2.rotate(img, cv2.ROTATE_180)\n",
+    "        ax.imshow(img)\n",
+    "        ax.set_title(f\"{cam_key} ({img.shape})\")\n",
+    "        ax.axis(\"off\")\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()\n",
+    "\n",
+    "obs = get_full_observation()\n",
+    "viz_observation(obs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ef87e90",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c899db7a",
+   "metadata": {},
+   "source": [
+    "## Build Observation for Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb2ceb63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from physicalai.data.observation import Observation\n",
+    "\n",
+    "def build_observation(obs: dict) -> Observation:\n",
+    "    \"\"\"Convert robot observation to Pi05 Observation.\"\"\"\n",
+    "    # State: (1, 6) float32\n",
+    "    state = torch.tensor(\n",
+    "        [[obs[f\"{jn}.pos\"] for jn in JOINT_NAMES]], dtype=torch.float32, device=DEVICE\n",
+    "    )\n",
+    "\n",
+    "    # Images: {name: (1, 3, H, W) float32 in [0, 1]}\n",
+    "    def img_to_tensor(img: np.ndarray) -> torch.Tensor:\n",
+    "        t = torch.from_numpy(img.copy()).float() / 255.0  # (H, W, 3) -> [0,1]\n",
+    "        t = t.permute(2, 0, 1).unsqueeze(0)  # (1, 3, H, W)\n",
+    "        return t.to(DEVICE)\n",
+    "\n",
+    "    images = {}\n",
+    "    for cam_key in [\"overhead\", \"arm\"]:\n",
+    "        img = np.ascontiguousarray(obs[cam_key])\n",
+    "        if cam_key in FLIP_CAMERAS:\n",
+    "            img = cv2.rotate(img, cv2.ROTATE_180)\n",
+    "        images[cam_key] = img_to_tensor(img)\n",
+    "\n",
+    "    return Observation(state=state, images=images, task=TASK)\n",
+    "\n",
+    "observation = build_observation(get_full_observation())\n",
+    "print(f\"State shape: {observation.state.shape}\")\n",
+    "for k, v in observation.images.items():\n",
+    "    print(f\"Image '{k}': {v.shape}, device={v.device}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a0557e9",
+   "metadata": {},
+   "source": [
+    "## Run Inference (single step, timed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5167ad02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "# Reset action queue to get a fresh prediction\n",
+    "policy._action_queue = deque()\n",
+    "\n",
+    "# Warm up GPU\n",
+    "if DEVICE == \"cuda\":\n",
+    "    torch.cuda.synchronize()\n",
+    "\n",
+    "t0 = time.perf_counter()\n",
+    "with torch.no_grad():\n",
+    "    action = policy.select_action(observation)\n",
+    "if DEVICE == \"cuda\":\n",
+    "    torch.cuda.synchronize()\n",
+    "latency = time.perf_counter() - t0\n",
+    "\n",
+    "action_np = action.detach().cpu().numpy()\n",
+    "if action_np.ndim == 2:\n",
+    "    action_np = action_np[0]\n",
+    "\n",
+    "print(f\"Inference latency: {latency*1000:.1f} ms\")\n",
+    "print(f\"Action shape: {action_np.shape}\")\n",
+    "print(f\"Action: {action_np}\")\n",
+    "print()\n",
+    "for i, jn in enumerate(JOINT_NAMES):\n",
+    "    print(f\"  {jn}: {action_np[i]:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "130d146d",
+   "metadata": {},
+   "source": [
+    "## Benchmark (5 inferences)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a7dd57d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "latencies = []\n",
+    "for i in range(5):\n",
+    "    policy._action_queue = deque()\n",
+    "    if DEVICE == \"cuda\":\n",
+    "        torch.cuda.synchronize()\n",
+    "    t0 = time.perf_counter()\n",
+    "    with torch.no_grad():\n",
+    "        action = policy.select_action(observation)\n",
+    "    if DEVICE == \"cuda\":\n",
+    "        torch.cuda.synchronize()\n",
+    "    latencies.append(time.perf_counter() - t0)\n",
+    "\n",
+    "print(f\"Latencies: {[f'{l*1000:.1f}ms' for l in latencies]}\")\n",
+    "print(f\"Mean: {np.mean(latencies)*1000:.1f} ms\")\n",
+    "print(f\"Min:  {np.min(latencies)*1000:.1f} ms\")\n",
+    "print(f\"Max:  {np.max(latencies)*1000:.1f} ms\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72d136ad",
+   "metadata": {},
+   "source": [
+    "## Send Action to Robot\n",
+    "\n",
+    "**Run only when ready to move.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21aeb2ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "action_dict = {f\"{jn}.pos\": float(action_np[i]) for i, jn in enumerate(JOINT_NAMES)}\n",
+    "robot.send_action(action_dict)\n",
+    "print(f\"Sent: {action_dict}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7084ab5",
+   "metadata": {},
+   "source": [
+    "## Rollout Loop (observe → infer → execute full chunk)\n",
+    "\n",
+    "Runs a continuous loop: get observation, predict action chunk, execute all actions in the chunk at 30Hz, then repeat. **Interrupt the kernel to stop.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "66acc85f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FPS = 30\n",
+    "# CHUNK_SIZE = 50  # Pi05 default action chunk length\n",
+    "\n",
+    "# try:\n",
+    "#     step = 0\n",
+    "#     while True:\n",
+    "#         # Get fresh observation\n",
+    "#         obs = get_full_observation()\n",
+    "#         observation = build_observation(obs)\n",
+    "\n",
+    "#         # Generate full action chunk\n",
+    "#         policy._action_queue = deque()\n",
+    "#         t0 = time.perf_counter()\n",
+    "#         with torch.no_grad():\n",
+    "#             action = policy.select_action(observation)\n",
+    "#         infer_ms = (time.perf_counter() - t0) * 1000\n",
+    "\n",
+    "#         # Drain remaining actions from the queue (select_action returns first, rest are queued)\n",
+    "#         actions = [action.detach().cpu().numpy()]\n",
+    "#         while len(policy._action_queue) > 0:\n",
+    "#             a = policy._action_queue.popleft()\n",
+    "#             actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)\n",
+    "\n",
+    "#         print(f\"[step {step}] Inferred {len(actions)} actions in {infer_ms:.0f}ms\")\n",
+    "\n",
+    "#         # Execute each action in the chunk at target FPS\n",
+    "#         for i, act in enumerate(actions):\n",
+    "#             t_act = time.perf_counter()\n",
+    "#             act = np.squeeze(act)\n",
+    "#             action_dict = {f\"{jn}.pos\": float(act[j]) for j, jn in enumerate(JOINT_NAMES)}\n",
+    "#             robot.send_action(action_dict)\n",
+    "\n",
+    "#             # Sleep to maintain target FPS\n",
+    "#             elapsed = time.perf_counter() - t_act\n",
+    "#             sleep_time = (1.0 / FPS) - elapsed\n",
+    "#             if sleep_time > 0:\n",
+    "#                 time.sleep(sleep_time)\n",
+    "\n",
+    "#         step += 1\n",
+    "\n",
+    "# except KeyboardInterrupt:\n",
+    "#     print(f\"\\nStopped after {step} chunks\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd3ce7cc",
+   "metadata": {},
+   "source": [
+    "## Async Threaded Rollout\n",
+    "\n",
+    "Background thread runs GPU inference while the main thread sends actions at 30Hz.\n",
+    "Lerp-blends overlapping chunks for smooth transitions. **Interrupt the kernel to stop.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d054a13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import threading\n",
+    "\n",
+    "class QueueMixer:\n",
+    "    \"\"\"Action queue with lerp blending between chunks.\"\"\"\n",
+    "\n",
+    "    def __init__(self, lerp_duration: int = 5):\n",
+    "        self.queue: np.ndarray | None = None\n",
+    "        self.lerp_duration = lerp_duration\n",
+    "        self.index = 0\n",
+    "\n",
+    "    def add(self, chunk: np.ndarray, offset: int = 0):\n",
+    "        if self.queue is None or self.index >= len(self.queue):\n",
+    "            self.queue = chunk[offset:]\n",
+    "            self.index = 0\n",
+    "            return\n",
+    "\n",
+    "        remaining = self.queue[self.index:]\n",
+    "        incoming = chunk[offset:]\n",
+    "        n_remain = len(remaining)\n",
+    "        lerp_dur = min(n_remain, self.lerp_duration)\n",
+    "\n",
+    "        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)[:, np.newaxis]\n",
+    "        n_blend = min(n_remain, len(incoming))\n",
+    "        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]\n",
+    "        self.queue = np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)\n",
+    "        self.index = 0\n",
+    "\n",
+    "    def pop(self) -> np.ndarray | None:\n",
+    "        if self.queue is None or self.index >= len(self.queue):\n",
+    "            return None\n",
+    "        action = self.queue[self.index]\n",
+    "        self.index += 1\n",
+    "        return action\n",
+    "\n",
+    "    @property\n",
+    "    def remaining(self) -> int:\n",
+    "        if self.queue is None:\n",
+    "            return 0\n",
+    "        return max(len(self.queue) - self.index, 0)\n",
+    "\n",
+    "\n",
+    "class GpuInferenceThread:\n",
+    "    \"\"\"Background thread that runs PyTorch GPU inference on demand.\"\"\"\n",
+    "\n",
+    "    def __init__(self, policy, device: str):\n",
+    "        self.policy = policy\n",
+    "        self.device = device\n",
+    "        self._lock = threading.Lock()\n",
+    "        self._obs_slot: dict | None = None\n",
+    "        self._obs_ready = threading.Event()\n",
+    "        self._running_inference = False\n",
+    "        self._request_time = 0.0\n",
+    "        self._result_lock = threading.Lock()\n",
+    "        self._result_slot: tuple[np.ndarray, float] | None = None\n",
+    "        self._stop = threading.Event()\n",
+    "        self._thread = threading.Thread(target=self._run, name=\"GpuInferenceThread\", daemon=True)\n",
+    "        self.inference_count = 0\n",
+    "\n",
+    "    def start(self):\n",
+    "        self._thread.start()\n",
+    "\n",
+    "    def stop(self):\n",
+    "        self._stop.set()\n",
+    "        self._obs_ready.set()\n",
+    "        self._thread.join(timeout=10.0)\n",
+    "\n",
+    "    def request(self, obs_dict: dict) -> bool:\n",
+    "        with self._lock:\n",
+    "            if self._obs_slot is not None or self._running_inference:\n",
+    "                return False\n",
+    "            self._obs_slot = obs_dict\n",
+    "            self._request_time = time.perf_counter()\n",
+    "        self._obs_ready.set()\n",
+    "        return True\n",
+    "\n",
+    "    @property\n",
+    "    def busy(self) -> bool:\n",
+    "        with self._lock:\n",
+    "            return self._obs_slot is not None or self._running_inference\n",
+    "\n",
+    "    @property\n",
+    "    def alive(self) -> bool:\n",
+    "        return self._thread.is_alive()\n",
+    "\n",
+    "    @property\n",
+    "    def busy_duration(self) -> float:\n",
+    "        \"\"\"Seconds since last request was submitted. 0 if not busy.\"\"\"\n",
+    "        with self._lock:\n",
+    "            if not (self._obs_slot is not None or self._running_inference):\n",
+    "                return 0.0\n",
+    "            return time.perf_counter() - self._request_time\n",
+    "\n",
+    "    def force_reset(self):\n",
+    "        \"\"\"Clear stuck state so new requests can be submitted.\"\"\"\n",
+    "        with self._lock:\n",
+    "            self._obs_slot = None\n",
+    "            self._running_inference = False\n",
+    "        print(\"[InferenceThread] Force reset — cleared stuck state\")\n",
+    "\n",
+    "    def get_result(self) -> tuple[np.ndarray, float] | None:\n",
+    "        with self._result_lock:\n",
+    "            r = self._result_slot\n",
+    "            self._result_slot = None\n",
+    "        return r\n",
+    "\n",
+    "    def _run(self):\n",
+    "        while not self._stop.is_set():\n",
+    "            self._obs_ready.wait()\n",
+    "            self._obs_ready.clear()\n",
+    "            if self._stop.is_set():\n",
+    "                break\n",
+    "\n",
+    "            with self._lock:\n",
+    "                obs_dict = self._obs_slot\n",
+    "                self._obs_slot = None\n",
+    "                self._running_inference = True\n",
+    "\n",
+    "            if obs_dict is None:\n",
+    "                with self._lock:\n",
+    "                    self._running_inference = False\n",
+    "                continue\n",
+    "\n",
+    "            self.inference_count += 1\n",
+    "            t0 = time.perf_counter()\n",
+    "\n",
+    "            try:\n",
+    "                observation = build_observation(obs_dict)\n",
+    "                self.policy._action_queue = deque()\n",
+    "\n",
+    "                with torch.no_grad():\n",
+    "                    action = self.policy.select_action(observation)\n",
+    "                if self.device == \"cuda\":\n",
+    "                    torch.cuda.synchronize()\n",
+    "\n",
+    "                # Collect full chunk\n",
+    "                actions = [action.detach().cpu().numpy()]\n",
+    "                while len(self.policy._action_queue) > 0:\n",
+    "                    a = self.policy._action_queue.popleft()\n",
+    "                    actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)\n",
+    "\n",
+    "                chunk = np.array([np.squeeze(a) for a in actions], dtype=np.float32)\n",
+    "            except Exception as e:\n",
+    "                import traceback\n",
+    "                print(f\"[InferenceThread] Error: {e}\")\n",
+    "                traceback.print_exc()\n",
+    "                with self._lock:\n",
+    "                    self._running_inference = False\n",
+    "                continue\n",
+    "\n",
+    "            latency = time.perf_counter() - t0\n",
+    "            with self._result_lock:\n",
+    "                self._result_slot = (chunk, latency)\n",
+    "            with self._lock:\n",
+    "                self._running_inference = False\n",
+    "\n",
+    "print(\"QueueMixer and GpuInferenceThread defined\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "664e5887",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ASYNC_FPS = 30\n",
+    "QUEUE_THRESHOLD_FRAC = 0.5  # Request new inference when queue drops below this fraction\n",
+    "LERP_DURATION = 5           # Frames over which to blend old/new chunks\n",
+    "PLOT_INTERVAL = 1           # Seconds between live plot updates\n",
+    "\n",
+    "# --- Warm-up to determine chunk size ---\n",
+    "warmup_obs = get_full_observation()\n",
+    "warmup_observation = build_observation(warmup_obs)\n",
+    "policy._action_queue = deque()\n",
+    "with torch.no_grad():\n",
+    "    warmup_action = policy.select_action(warmup_observation)\n",
+    "warmup_actions = [warmup_action.detach().cpu().numpy()]\n",
+    "while len(policy._action_queue) > 0:\n",
+    "    a = policy._action_queue.popleft()\n",
+    "    warmup_actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)\n",
+    "warmup_chunk = np.array([np.squeeze(a) for a in warmup_actions], dtype=np.float32)\n",
+    "chunk_size = warmup_chunk.shape[0]\n",
+    "threshold = int(chunk_size * QUEUE_THRESHOLD_FRAC)\n",
+    "print(f\"Chunk size: {chunk_size}, threshold: {threshold}\")\n",
+    "\n",
+    "# --- Telemetry collector ---\n",
+    "telemetry = {\n",
+    "    \"t\": [],                  # wall time per step\n",
+    "    \"actions_sent\": [],       # action_array for each action sent\n",
+    "    \"queue_depth\": [],        # (t, remaining) per step\n",
+    "    \"infer_requests\": [],     # t when inference was requested\n",
+    "    \"infer_completions\": [],  # (t, latency_ms, offset)\n",
+    "    \"chunks_received\": [],    # (t_received, chunk_array, inference_num)\n",
+    "    \"holds\": [],              # t when queue was empty (holding position)\n",
+    "}\n",
+    "t_start = time.perf_counter()\n",
+    "\n",
+    "# --- Live plotting ---\n",
+    "import matplotlib.patches as mpatches\n",
+    "from IPython.display import display\n",
+    "\n",
+    "def render_telemetry(telemetry, threshold, chunk_size):\n",
+    "    \"\"\"Render telemetry to a matplotlib figure.\"\"\"\n",
+    "    actions_arr = np.array(telemetry[\"actions_sent\"])\n",
+    "    t_arr = np.array(telemetry[\"t\"])\n",
+    "    if len(t_arr) < 2:\n",
+    "        return None\n",
+    "\n",
+    "    queue_t, queue_d = zip(*telemetry[\"queue_depth\"])\n",
+    "    n_joints = len(JOINT_NAMES)\n",
+    "    n_chunks = len(telemetry[\"chunks_received\"])\n",
+    "    chunk_cmap = plt.cm.Set2\n",
+    "\n",
+    "    fig, axes = plt.subplots(n_joints + 1, 1, figsize=(16, 2.5 * (n_joints + 1)), sharex=True)\n",
+    "\n",
+    "    for j, jn in enumerate(JOINT_NAMES):\n",
+    "        ax = axes[j]\n",
+    "        ax.plot(t_arr, actions_arr[:, j], color=\"black\", linewidth=1.2, zorder=3)\n",
+    "\n",
+    "        for idx, (t_recv, chunk, _) in enumerate(telemetry[\"chunks_received\"]):\n",
+    "            color = chunk_cmap(idx % 8)\n",
+    "            chunk_t = t_recv + np.arange(len(chunk)) / ASYNC_FPS\n",
+    "            ax.plot(chunk_t, chunk[:, j], color=color, alpha=0.4, linewidth=1.5, zorder=2)\n",
+    "\n",
+    "        for t_req in telemetry[\"infer_requests\"]:\n",
+    "            ax.axvline(t_req, color=\"green\", alpha=0.3, linewidth=0.8, linestyle=\"--\")\n",
+    "        for t_comp, _, _ in telemetry[\"infer_completions\"]:\n",
+    "            ax.axvline(t_comp, color=\"blue\", alpha=0.3, linewidth=0.8, linestyle=\"-.\")\n",
+    "        for ht in telemetry[\"holds\"]:\n",
+    "            ax.axvspan(ht, ht + 1/ASYNC_FPS, color=\"red\", alpha=0.15)\n",
+    "\n",
+    "        ax.set_ylabel(jn, fontsize=9)\n",
+    "        ax.tick_params(labelsize=8)\n",
+    "\n",
+    "    ax_q = axes[-1]\n",
+    "    ax_q.fill_between(queue_t, queue_d, alpha=0.3, color=\"steelblue\")\n",
+    "    ax_q.plot(queue_t, queue_d, color=\"steelblue\", linewidth=1)\n",
+    "    ax_q.axhline(threshold, color=\"orange\", linestyle=\"--\", linewidth=1, label=f\"threshold ({threshold})\")\n",
+    "    for t_req in telemetry[\"infer_requests\"]:\n",
+    "        ax_q.axvline(t_req, color=\"green\", alpha=0.5, linewidth=1, linestyle=\"--\")\n",
+    "    for t_comp, _, _ in telemetry[\"infer_completions\"]:\n",
+    "        ax_q.axvline(t_comp, color=\"blue\", alpha=0.5, linewidth=1, linestyle=\"-.\")\n",
+    "    ax_q.set_ylabel(\"Queue\", fontsize=9)\n",
+    "    ax_q.set_xlabel(\"Time (s)\", fontsize=10)\n",
+    "    ax_q.legend(loc=\"upper right\", fontsize=8)\n",
+    "    ax_q.tick_params(labelsize=8)\n",
+    "\n",
+    "    legend_handles = [\n",
+    "        mpatches.Patch(color=\"black\", label=\"Sent to robot\"),\n",
+    "        mpatches.Patch(color=\"green\", alpha=0.5, label=\"Infer requested\"),\n",
+    "        mpatches.Patch(color=\"blue\", alpha=0.5, label=\"Chunk received\"),\n",
+    "        mpatches.Patch(color=\"red\", alpha=0.3, label=\"Hold\"),\n",
+    "    ]\n",
+    "    for idx in range(min(n_chunks, 6)):\n",
+    "        legend_handles.append(mpatches.Patch(color=chunk_cmap(idx % 8), alpha=0.5, label=f\"Chunk #{idx+1}\"))\n",
+    "    fig.legend(handles=legend_handles, loc=\"upper center\", ncol=min(len(legend_handles), 5), fontsize=8, bbox_to_anchor=(0.5, 1.02))\n",
+    "\n",
+    "    n_holds = len(telemetry[\"holds\"])\n",
+    "    fig.suptitle(f\"Live — {len(t_arr)} steps, {n_chunks} inferences, {n_holds} holds, t={t_arr[-1]:.1f}s\", fontsize=11, y=1.04)\n",
+    "    plt.tight_layout()\n",
+    "    return fig\n",
+    "\n",
+    "# Show initial empty plot placeholder\n",
+    "plot_handle = display(plt.figure(figsize=(16, 2)), display_id=True)\n",
+    "plt.close()\n",
+    "\n",
+    "# --- Init ---\n",
+    "queue = QueueMixer(lerp_duration=LERP_DURATION)\n",
+    "queue.add(warmup_chunk)  # Seed so robot can move immediately\n",
+    "\n",
+    "infer_thread = GpuInferenceThread(policy, DEVICE)\n",
+    "infer_thread.start()\n",
+    "\n",
+    "goal_time = 1.0 / ASYNC_FPS\n",
+    "last_action = warmup_chunk[0]\n",
+    "step = 0\n",
+    "hold_count = 0\n",
+    "last_plot_time = 0.0\n",
+    "\n",
+    "print(f\"Async rollout at {ASYNC_FPS} Hz — interrupt kernel to stop\")\n",
+    "\n",
+    "try:\n",
+    "    while True:\n",
+    "        loop_start = time.perf_counter()\n",
+    "        t_rel = loop_start - t_start\n",
+    "\n",
+    "        # 1. Check for inference result\n",
+    "        result = infer_thread.get_result()\n",
+    "        if result is not None:\n",
+    "            chunk, latency = result\n",
+    "            offset = int(latency * ASYNC_FPS)\n",
+    "            queue.add(chunk, offset=offset)\n",
+    "            queue.lerp_duration = max(offset, 1)\n",
+    "            telemetry[\"infer_completions\"].append((t_rel, latency * 1000, offset))\n",
+    "            telemetry[\"chunks_received\"].append((t_rel, chunk.copy(), infer_thread.inference_count))\n",
+    "            print(f\"[t={t_rel:.1f}s] <<< inference #{infer_thread.inference_count}: \"\n",
+    "                  f\"{latency*1000:.0f}ms, offset={offset}, queue={queue.remaining}\")\n",
+    "\n",
+    "        # 2. Request new inference when queue is low (with stuck/dead detection)\n",
+    "        INFER_TIMEOUT = 15.0\n",
+    "        if queue.remaining <= threshold:\n",
+    "            if not infer_thread.alive:\n",
+    "                print(f\"[t={t_rel:.1f}s] !!! Inference thread DEAD — restarting\")\n",
+    "                infer_thread = GpuInferenceThread(policy, DEVICE)\n",
+    "                infer_thread.start()\n",
+    "            elif infer_thread.busy_duration > INFER_TIMEOUT:\n",
+    "                print(f\"[t={t_rel:.1f}s] !!! Inference stuck for {infer_thread.busy_duration:.0f}s — force resetting\")\n",
+    "                infer_thread.force_reset()\n",
+    "\n",
+    "            if not infer_thread.busy:\n",
+    "                obs = get_full_observation()\n",
+    "                submitted = infer_thread.request(obs)\n",
+    "                if submitted:\n",
+    "                    telemetry[\"infer_requests\"].append(t_rel)\n",
+    "                    print(f\"[t={t_rel:.1f}s] >>> inference requested (queue={queue.remaining}/{chunk_size})\")\n",
+    "\n",
+    "        # 3. Pop action\n",
+    "        action = queue.pop()\n",
+    "        if action is not None:\n",
+    "            last_action = action\n",
+    "            hold_count = 0\n",
+    "        else:\n",
+    "            action = last_action\n",
+    "            hold_count += 1\n",
+    "            telemetry[\"holds\"].append(t_rel)\n",
+    "            if hold_count == 1 or hold_count % 30 == 0:\n",
+    "                print(f\"[t={t_rel:.1f}s] !!! Queue empty, holding (hold={hold_count})\")\n",
+    "\n",
+    "        # 4. Send to robot\n",
+    "        if action is not None:\n",
+    "            action_dict = {f\"{jn}.pos\": float(action[i]) for i, jn in enumerate(JOINT_NAMES)}\n",
+    "            robot.send_action(action_dict)\n",
+    "\n",
+    "        # 5. Record telemetry\n",
+    "        telemetry[\"t\"].append(t_rel)\n",
+    "        telemetry[\"actions_sent\"].append(action.copy() if action is not None else last_action.copy())\n",
+    "        telemetry[\"queue_depth\"].append((t_rel, queue.remaining))\n",
+    "\n",
+    "        # 6. Live plot update (throttled)\n",
+    "        if t_rel - last_plot_time > PLOT_INTERVAL and len(telemetry[\"t\"]) > 10:\n",
+    "            fig = render_telemetry(telemetry, threshold, chunk_size)\n",
+    "            if fig is not None:\n",
+    "                plot_handle.update(fig)\n",
+    "                plt.close(fig)\n",
+    "            last_plot_time = t_rel\n",
+    "\n",
+    "        # 7. Maintain FPS\n",
+    "        elapsed = time.perf_counter() - loop_start\n",
+    "        sleep_time = goal_time - elapsed\n",
+    "        if sleep_time > 0:\n",
+    "            time.sleep(sleep_time)\n",
+    "\n",
+    "        step += 1\n",
+    "\n",
+    "except KeyboardInterrupt:\n",
+    "    print(f\"\\nStopped after {step} steps, {infer_thread.inference_count} inferences, \"\n",
+    "          f\"{len(telemetry['holds'])} holds, duration={time.perf_counter() - t_start:.1f}s\")\n",
+    "\n",
+    "infer_thread.stop()\n",
+    "print(\"Inference thread stopped\")\n",
+    "\n",
+    "# Final plot\n",
+    "fig = render_telemetry(telemetry, threshold, chunk_size)\n",
+    "if fig is not None:\n",
+    "    plot_handle.update(fig)\n",
+    "    plt.close(fig)\n",
+    "\n",
+    "if telemetry[\"infer_completions\"]:\n",
+    "    latencies_ms = [lat for _, lat, _ in telemetry[\"infer_completions\"]]\n",
+    "    t_arr = np.array(telemetry[\"t\"])\n",
+    "    print(f\"\\nInference latencies: mean={np.mean(latencies_ms):.0f}ms, \"\n",
+    "          f\"min={np.min(latencies_ms):.0f}ms, max={np.max(latencies_ms):.0f}ms\")\n",
+    "    print(f\"Queue empty fraction: {len(telemetry['holds'])/max(len(t_arr),1)*100:.1f}%\")\n",
+    "    print(f\"Total duration: {t_arr[-1]:.1f}s, steps: {len(t_arr)}, effective Hz: {len(t_arr)/t_arr[-1]:.1f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b0e2a88d",
+   "metadata": {},
+   "source": [
+    "## Disconnect"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6eb8f97",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for name, cam in cameras.items():\n",
+    "    cam.disconnect()\n",
+    "\n",
+    "    print(f\"Camera '{name}' disconnected\")\n",
+    "    \n",
+    "robot.disconnect()\n",
+    "print(\"Robot disconnected\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "physical-ai-studio (3.12.3)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/physicalai/runtime/__init__.py b/src/physicalai/runtime/__init__.py
new file mode 100644
index 0000000..4be467a
--- /dev/null
+++ b/src/physicalai/runtime/__init__.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Runtime system for running trained policies on robot hardware.
+
+Public API::
+
+    from physicalai.runtime import PolicyRuntime, RunStats, RuntimeCallback
+    from physicalai.runtime import SyncExecution, AsyncExecution, Execution, WorkerDiedError
+    from physicalai.runtime import ActionQueue
+    from physicalai.runtime import ChunkSmoother, LerpSmoother, ReplaceSmoother
+    from physicalai.runtime import default_observation_to_input
+"""
+
+from physicalai.runtime._action_queue import ActionQueue  # noqa: PLC2701
+from physicalai.runtime.execution import (
+    AsyncExecution,
+    Execution,
+    SyncExecution,
+    WorkerDiedError,
+)
+from physicalai.runtime.runtime import (
+    PolicyRuntime,
+    RunStats,
+    RuntimeCallback,
+    default_observation_to_input,
+)
+from physicalai.runtime.smoothers import ChunkSmoother, LerpSmoother, ReplaceSmoother
+
+__all__ = [
+    "ActionQueue",
+    "AsyncExecution",
+    "ChunkSmoother",
+    "Execution",
+    "LerpSmoother",
+    "PolicyRuntime",
+    "ReplaceSmoother",
+    "RunStats",
+    "RuntimeCallback",
+    "SyncExecution",
+    "WorkerDiedError",
+    "default_observation_to_input",
+]
diff --git a/src/physicalai/runtime/_action_queue.py b/src/physicalai/runtime/_action_queue.py
new file mode 100644
index 0000000..af768a6
--- /dev/null
+++ b/src/physicalai/runtime/_action_queue.py
@@ -0,0 +1,72 @@
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import threading
+from collections import deque
+
+import numpy as np
+
+from physicalai.runtime.smoothers import ChunkSmoother, ReplaceSmoother
+
+
+class ActionQueue:
+    """Thread-safe action queue with chunk smoothing."""
+
+    def __init__(self, smoother: ChunkSmoother | None = None) -> None:
+        self._smoother = smoother or ReplaceSmoother()
+        self._deque: deque[np.ndarray] = deque()
+        self._lock = threading.Lock()
+        self._consecutive_holds = 0
+        self._total_holds = 0
+        self._total_pops = 0
+
+    def push_chunk(self, chunk: np.ndarray, offset: int = 0) -> None:
+        """Push an action chunk, blending with remaining actions via the smoother."""
+        with self._lock:
+            remaining = np.stack(list(self._deque)) if self._deque else np.empty((0, chunk.shape[1]), dtype=chunk.dtype)
+            merged = self._smoother.merge(remaining, chunk, offset)
+            self._deque.clear()
+            self._deque.extend(merged)
+
+    def pop(self) -> np.ndarray | None:
+        """Pop the next action.
+
+        Returns:
+            Single action vector, or None if empty.
+        """
+        with self._lock:
+            if not self._deque:
+                self._consecutive_holds += 1
+                self._total_holds += 1
+                return None
+            self._consecutive_holds = 0
+            self._total_pops += 1
+            return self._deque.popleft()
+
+    @property
+    def remaining(self) -> int:
+        with self._lock:
+            return len(self._deque)
+
+    @property
+    def consecutive_holds(self) -> int:
+        return self._consecutive_holds
+
+    @property
+    def total_holds(self) -> int:
+        return self._total_holds
+
+    @property
+    def total_pops(self) -> int:
+        return self._total_pops
+
+    def below_threshold(self, threshold: int) -> bool:
+        with self._lock:
+            return len(self._deque) < threshold
+
+    def clear(self) -> None:
+        with self._lock:
+            self._deque.clear()
+            self._consecutive_holds = 0
diff --git a/src/physicalai/runtime/execution.py b/src/physicalai/runtime/execution.py
new file mode 100644
index 0000000..28ec86c
--- /dev/null
+++ b/src/physicalai/runtime/execution.py
@@ -0,0 +1,250 @@
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Execution strategies for scheduling policy inference."""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from physicalai.inference.model import InferenceModel
+    from physicalai.runtime._action_queue import ActionQueue
+
+logger = logging.getLogger(__name__)
+
+_NOT_STARTED = "start() must be called before this method"
+
+
+class WorkerDiedError(RuntimeError):
+    """Raised when the inference worker thread dies unexpectedly."""
+
+
+class Execution(ABC):
+    """Decides when and where inference runs. Pushes results into ActionQueue."""
+
+    @abstractmethod
+    def start(self, model: InferenceModel, action_queue: ActionQueue) -> None:
+        """Bind to model and queue. Called once before the loop."""
+        ...
+
+    @abstractmethod
+    def maybe_request(self, observation: dict[str, np.ndarray]) -> None:
+        """Check if new inference is needed. If so, run or schedule it."""
+        ...
+
+    @abstractmethod
+    def warmup(self, sample_observation: dict[str, np.ndarray]) -> None:
+        """Run one inference to discover chunk_size and seed the queue."""
+        ...
+
+    @abstractmethod
+    def stop(self) -> None:
+        """Stop scheduling."""
+        ...
+
+    @property
+    @abstractmethod
+    def chunk_size(self) -> int:
+        """Discovered after warmup()."""
+        ...
+
+
+class SyncExecution(Execution):
+    """Synchronous inference in the control thread."""
+
+    def __init__(self) -> None:  # noqa: D107
+        self._model: InferenceModel | None = None
+        self._queue: ActionQueue | None = None
+        self._chunk_size: int = 0
+
+    def start(self, model: InferenceModel, action_queue: ActionQueue) -> None:
+        """Bind model and queue."""
+        self._model = model
+        self._queue = action_queue
+
+    def warmup(self, sample_observation: dict[str, np.ndarray]) -> None:
+        """Run one inference, seed queue, discover chunk_size.
+
+        Raises:
+            RuntimeError: If start() has not been called.
+        """
+        if self._model is None or self._queue is None:
+            raise RuntimeError(_NOT_STARTED)
+        actions = self._model.predict_action_chunk(sample_observation)
+        self._chunk_size = actions.shape[0]
+        self._queue.push_chunk(actions, offset=0)
+
+    def maybe_request(self, observation: dict[str, np.ndarray]) -> None:
+        """Refill queue synchronously when empty.
+
+        Raises:
+            RuntimeError: If start() has not been called.
+        """
+        if self._model is None or self._queue is None:
+            raise RuntimeError(_NOT_STARTED)
+        if self._queue.below_threshold(1):
+            actions = self._model.predict_action_chunk(observation)
+            self._queue.push_chunk(actions, offset=0)
+
+    def stop(self) -> None:
+        """No-op for synchronous execution."""
+
+    @property
+    def chunk_size(self) -> int:
+        """Return discovered chunk size."""
+        return self._chunk_size
+
+
+class AsyncExecution(Execution):
+    """Async inference in a background thread with health monitoring."""
+
+    def __init__(  # noqa: D107
+        self,
+        threshold: float = 0.5,
+        fps: int = 30,
+        watchdog_timeout_s: float = 30.0,
+        max_consecutive_holds: int | None = None,
+    ) -> None:
+        self._threshold_frac = threshold
+        self._fps = fps
+        self._watchdog_timeout_s = watchdog_timeout_s
+        self._max_consecutive_holds = max_consecutive_holds or 3 * fps
+
+        self._model: InferenceModel | None = None
+        self._queue: ActionQueue | None = None
+        self._chunk_size: int = 0
+        self._threshold_count: int = 0
+
+        self._lock = threading.Lock()
+        self._obs_slot: dict[str, np.ndarray] | None = None
+        self._obs_ready = threading.Event()
+        self._running_inference = False
+        self._request_time: float = 0.0
+        self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None
+        self._death_cause: BaseException | None = None
+        self._inference_count: int = 0
+
+    def start(self, model: InferenceModel, action_queue: ActionQueue) -> None:
+        """Bind model/queue and spawn inference thread."""
+        self._model = model
+        self._queue = action_queue
+        self._thread = threading.Thread(target=self._run, name="InferenceThread", daemon=True)
+        self._thread.start()
+
+    def warmup(self, sample_observation: dict[str, np.ndarray]) -> None:
+        """Run one inference in main thread, seed queue, discover chunk_size.
+
+        Raises:
+            RuntimeError: If start() has not been called.
+        """
+        if self._model is None or self._queue is None:
+            raise RuntimeError(_NOT_STARTED)
+        actions = self._model.predict_action_chunk(sample_observation)
+        self._chunk_size = actions.shape[0]
+        self._threshold_count = int(self._chunk_size * self._threshold_frac)
+        self._queue.push_chunk(actions, offset=0)
+
+    def maybe_request(self, observation: dict[str, np.ndarray]) -> None:
+        """Submit observation for background inference if queue is low and worker idle.
+
+        Raises:
+            RuntimeError: If start() has not been called.
+            WorkerDiedError: If the inference thread has died.
+        """
+        if self._queue is None:
+            raise RuntimeError(_NOT_STARTED)
+        if self._thread is not None and not self._thread.is_alive() and self._death_cause is not None:
+            msg = f"Inference thread died: {self._death_cause}"
+            raise WorkerDiedError(msg) from self._death_cause
+
+        if self._busy_duration > self._watchdog_timeout_s:
+            logger.warning("Inference stuck for %.0fs — force resetting", self._busy_duration)
+            self._force_reset()
+
+        if self._queue.below_threshold(self._threshold_count) and not self._busy:
+            snapshot = {k: v.copy() if isinstance(v, np.ndarray) else v for k, v in observation.items()}
+            with self._lock:
+                self._obs_slot = snapshot
+                self._request_time = time.perf_counter()
+            self._obs_ready.set()
+
+    def stop(self) -> None:
+        """Signal thread and join with timeout."""
+        if self._thread is not None:
+            self._stop_event.set()
+            self._obs_ready.set()
+            self._thread.join(timeout=10.0)
+
+    @property
+    def chunk_size(self) -> int:
+        """Return discovered chunk size."""
+        return self._chunk_size
+
+    @property
+    def alive(self) -> bool:
+        """Whether the inference thread is alive."""
+        return self._thread is not None and self._thread.is_alive()
+
+    @property
+    def inference_count(self) -> int:
+        """Number of completed inference calls."""
+        return self._inference_count
+
+    @property
+    def _busy(self) -> bool:
+        with self._lock:
+            return self._obs_slot is not None or self._running_inference
+
+    @property
+    def _busy_duration(self) -> float:
+        with self._lock:
+            if not (self._obs_slot is not None or self._running_inference):
+                return 0.0
+            return time.perf_counter() - self._request_time
+
+    def _force_reset(self) -> None:
+        with self._lock:
+            self._obs_slot = None
+            self._running_inference = False
+        logger.warning("Force reset — cleared stuck inference state")
+
+    def _run(self) -> None:
+        try:
+            while not self._stop_event.is_set():
+                self._obs_ready.wait()
+                self._obs_ready.clear()
+
+                if self._stop_event.is_set():
+                    return
+
+                with self._lock:
+                    obs = self._obs_slot
+                    self._obs_slot = None
+                    if obs is None:
+                        continue
+                    self._running_inference = True
+
+                if self._model is None or self._queue is None:
+                    raise RuntimeError(_NOT_STARTED)  # noqa: TRY301
+                t0 = time.perf_counter()
+                actions = self._model.predict_action_chunk(obs)
+                latency = time.perf_counter() - t0
+
+                offset = int(latency * self._fps)
+                self._queue.push_chunk(actions, offset=offset)
+                self._inference_count += 1
+
+                with self._lock:
+                    self._running_inference = False
+
+        except Exception as e:
+            self._death_cause = e
+            logger.exception("Inference thread died")
diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
new file mode 100644
index 0000000..bb92225
--- /dev/null
+++ b/src/physicalai/runtime/runtime.py
@@ -0,0 +1,226 @@
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""PolicyRuntime — runs a trained policy on robot hardware."""
+
+from __future__ import annotations
+
+import logging
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Protocol
+
+import numpy as np
+
+from physicalai.runtime._action_queue import ActionQueue  # noqa: PLC2701
+from physicalai.runtime.execution import Execution, WorkerDiedError
+from physicalai.runtime.smoothers import LerpSmoother
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Mapping, Sequence
+
+    from physicalai.capture.camera import Camera
+    from physicalai.capture.frame import Frame
+    from physicalai.inference.model import InferenceModel
+    from physicalai.robot.interface import Robot, RobotObservation
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_LERP_FRAMES = 5
+
+
+def default_observation_to_input(
+    robot_obs: RobotObservation,
+    camera_frames: dict[str, Frame],
+) -> dict[str, Any]:
+    """Convert robot observation and camera frames to model input dict.
+
+    Maps:
+        - ``robot_obs.joint_positions`` → ``"state"`` (as batch dim)
+        - ``frame.data`` per camera → ``"images.{name}"``
+
+    Returns:
+        Model input dictionary.
+    """
+    model_input: dict[str, Any] = {}
+
+    if robot_obs.joint_positions is not None:
+        model_input["state"] = np.array([robot_obs.joint_positions], dtype=np.float32)
+
+    for name, frame in camera_frames.items():
+        model_input[f"images.{name}"] = frame.data
+
+    return model_input
+
+
+class RuntimeCallback(Protocol):
+    """Optional hook points in the PolicyRuntime control loop."""
+
+    def before_send_action(self, *, action: np.ndarray, step: int) -> np.ndarray | None:
+        """Called before sending action. Return modified action or None."""
+        ...
+
+    def on_action_sent(self, *, action: np.ndarray, step: int) -> None:
+        """Called after action is sent to robot."""
+        ...
+
+    def on_hold(self, *, step: int, holds: int) -> None:
+        """Called when action queue is empty and robot holds last position."""
+        ...
+
+
+@dataclass(frozen=True)
+class RunStats:
+    """Statistics from a PolicyRuntime.run() session."""
+
+    steps: int
+    total_pops: int
+    total_holds: int
+    inference_count: int
+    transient_errors: int = 0
+    stale_obs_ticks: int = 0
+
+
+class PolicyRuntime:
+    """Runs a policy on robot hardware.
+
+    Loop: observe → maybe_request → pop → send → sleep.
+    Robot and cameras must be connected before run(). Caller owns lifecycle.
+    """
+
+    def __init__(  # noqa: D107
+        self,
+        robot: Robot,
+        model: InferenceModel,
+        execution: Execution,
+        fps: float,
+        cameras: Mapping[str, Camera] | None = None,
+        action_queue: ActionQueue | None = None,
+        obs_to_input: Callable[[RobotObservation, dict[str, Frame]], dict[str, Any]] | None = None,
+        callbacks: Sequence[RuntimeCallback] = (),
+    ) -> None:
+        if fps <= 0:
+            msg = f"fps must be positive, got {fps}"
+            raise ValueError(msg)
+        self._robot = robot
+        self._model = model
+        self._execution = execution
+        self._fps = fps
+        self._cameras: Mapping[str, Camera] = cameras or {}
+        self._action_queue = action_queue or ActionQueue(smoother=LerpSmoother(duration_frames=_DEFAULT_LERP_FRAMES))
+        self._obs_to_input = obs_to_input or default_observation_to_input
+        self._callbacks = list(callbacks)
+
+    def run(self, *, duration_s: float | None = None) -> RunStats:
+        """Run the control loop.
+
+        Args:
+            duration_s: Maximum duration in seconds. None runs indefinitely.
+
+        Returns:
+            Statistics from the run session.
+
+        Raises:
+            WorkerDiedError: If the inference worker thread dies.
+        """
+        self._execution.start(self._model, self._action_queue)
+        sample_obs = self._build_model_input()
+        self._execution.warmup(sample_obs)
+
+        goal_time = 1.0 / self._fps
+        step = 0
+        last_action: np.ndarray | None = None
+
+        try:
+            while True:
+                if duration_s is not None and step * goal_time >= duration_s:
+                    break
+
+                loop_start = time.perf_counter()
+
+                obs = self._build_model_input()
+                self._execution.maybe_request(obs)
+
+                action = self._action_queue.pop()
+                if action is not None:
+                    last_action = action
+                else:
+                    action = last_action
+                    holds = self._action_queue.consecutive_holds
+                    if holds == 1:
+                        logger.warning("Queue empty — holding position")
+                    elif self._fps > 0 and holds % int(self._fps) == 0:
+                        logger.warning(
+                            "Queue starvation: %d consecutive holds (%.1fs)",
+                            holds,
+                            holds / self._fps,
+                        )
+                    self._invoke_callback("on_hold", step=step, holds=holds)
+
+                if action is None:
+                    logger.error("No action available (warmup may have failed)")
+                    step += 1
+                    continue
+
+                modified = self._invoke_callback("before_send_action", action=action, step=step)
+                if modified is not None:
+                    action = modified
+
+                self._robot.send_action(action)
+                self._invoke_callback("on_action_sent", action=action, step=step)
+
+                elapsed = time.perf_counter() - loop_start
+                sleep_time = goal_time - elapsed
+                if sleep_time > 0:
+                    time.sleep(sleep_time)
+
+                step += 1
+
+        except KeyboardInterrupt:
+            logger.info("Interrupted by user")
+        except WorkerDiedError:
+            logger.exception("Worker died during runtime")
+            raise
+        finally:
+            self._shutdown(step)
+
+        return RunStats(
+            steps=step,
+            total_pops=self._action_queue.total_pops,
+            total_holds=self._action_queue.total_holds,
+            inference_count=getattr(self._execution, "inference_count", 0),
+        )
+
+    def _build_model_input(self) -> dict[str, Any]:
+        robot_obs = self._robot.get_observation()
+        camera_frames = {name: cam.read_latest() for name, cam in self._cameras.items()}
+        return self._obs_to_input(robot_obs, camera_frames)
+
+    def _shutdown(self, step: int) -> None:
+        self._execution.stop()
+
+        remaining = self._action_queue.remaining
+        drain_limit = min(remaining, int(self._fps))
+        for _ in range(drain_limit):
+            action = self._action_queue.pop()
+            if action is not None:
+                self._robot.send_action(action)
+                time.sleep(1.0 / self._fps)
+
+        logger.info(
+            "Shutdown complete — %d steps, %d pops, %d holds",
+            step,
+            self._action_queue.total_pops,
+            self._action_queue.total_holds,
+        )
+
+    def _invoke_callback(self, method: str, **kwargs: Any) -> Any:  # noqa: ANN401
+        result = None
+        for cb in self._callbacks:
+            fn = getattr(cb, method, None)
+            if fn is not None:
+                try:
+                    result = fn(**kwargs)
+                except Exception:
+                    logger.exception("Callback %s.%s raised", type(cb).__name__, method)
+        return result
diff --git a/src/physicalai/runtime/smoothers.py b/src/physicalai/runtime/smoothers.py
new file mode 100644
index 0000000..70d0bd9
--- /dev/null
+++ b/src/physicalai/runtime/smoothers.py
@@ -0,0 +1,71 @@
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Chunk smoothers for runtime action queues."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import override
+
+import numpy as np
+
+_NDIM_2 = 2
+_ERR_2D = "remaining and incoming must be 2D arrays"
+_ERR_ACTION_DIM = "remaining and incoming must have the same action_dim"
+
+
+class ChunkSmoother(ABC):
+    """Merges a new action chunk into remaining actions from the previous chunk."""
+
+    @abstractmethod
+    def merge(self, remaining: np.ndarray, incoming: np.ndarray, offset: int = 0) -> np.ndarray:
+        """Merge a previous remainder with a new incoming chunk."""
+        raise NotImplementedError
+
+
+class ReplaceSmoother(ChunkSmoother):
+    """Replace remaining actions with the incoming tail."""
+
+    @override
+    def merge(self, remaining: np.ndarray, incoming: np.ndarray, offset: int = 0) -> np.ndarray:
+        """Return the incoming chunk after skipping the offset."""
+        _validate_inputs(remaining, incoming)
+        return incoming[offset:]
+
+
+class LerpSmoother(ChunkSmoother):
+    """Blend overlapping actions and append the incoming tail."""
+
+    def __init__(self, duration_frames: int = 5) -> None:
+        """Create a smoother with a fallback lerp window."""
+        self.duration_frames = duration_frames
+
+    @override
+    def merge(self, remaining: np.ndarray, incoming: np.ndarray, offset: int = 0) -> np.ndarray:
+        """Merge chunks using queue-mixer-style linear interpolation.
+
+        Returns:
+            The blended action chunk.
+        """
+        _validate_inputs(remaining, incoming)
+
+        lerp_dur = max(offset, 1) if offset > 0 else self.duration_frames
+        incoming = incoming[offset:]
+        n_remain = len(remaining)
+        lerp_dur = min(n_remain, lerp_dur)
+
+        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)
+        weights = weights[:, np.newaxis]
+
+        n_blend = min(n_remain, len(incoming))
+        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]
+
+        return np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)
+
+
+def _validate_inputs(remaining: np.ndarray, incoming: np.ndarray) -> None:
+    if remaining.ndim != _NDIM_2 or incoming.ndim != _NDIM_2:
+        raise ValueError(_ERR_2D)
+    if remaining.shape[1] != incoming.shape[1]:
+        raise ValueError(_ERR_ACTION_DIM)

From 8f2b00a55f3529065b62044394117f6f67e5e5ed Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Fri, 15 May 2026 14:25:21 +0200
Subject: [PATCH 02/19] add tests

---
 tests/unit/runtime/__init__.py          |   0
 tests/unit/runtime/test_action_queue.py | 163 +++++++++++++
 tests/unit/runtime/test_execution.py    | 235 +++++++++++++++++++
 tests/unit/runtime/test_runtime.py      | 289 ++++++++++++++++++++++++
 tests/unit/runtime/test_smoothers.py    | 126 +++++++++++
 5 files changed, 813 insertions(+)
 create mode 100644 tests/unit/runtime/__init__.py
 create mode 100644 tests/unit/runtime/test_action_queue.py
 create mode 100644 tests/unit/runtime/test_execution.py
 create mode 100644 tests/unit/runtime/test_runtime.py
 create mode 100644 tests/unit/runtime/test_smoothers.py

diff --git a/tests/unit/runtime/__init__.py b/tests/unit/runtime/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/runtime/test_action_queue.py b/tests/unit/runtime/test_action_queue.py
new file mode 100644
index 0000000..ec60296
--- /dev/null
+++ b/tests/unit/runtime/test_action_queue.py
@@ -0,0 +1,163 @@
+from __future__ import annotations
+
+import threading
+
+import numpy as np
+
+from physicalai.runtime._action_queue import ActionQueue
+from physicalai.runtime.smoothers import LerpSmoother, ReplaceSmoother
+
+
+class TestActionQueue:
+    def test_push_pop_roundtrip(self) -> None:
+        queue = ActionQueue()
+        chunk = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
+        queue.push_chunk(chunk)
+
+        actions = [queue.pop() for _ in range(3)]
+        assert all(a is not None for a in actions)
+        for i, action in enumerate(actions):
+            np.testing.assert_array_equal(action, chunk[i])
+
+    def test_pop_empty_returns_none(self) -> None:
+        queue = ActionQueue()
+        assert queue.pop() is None
+
+    def test_consecutive_holds_increment_and_reset(self) -> None:
+        queue = ActionQueue()
+        queue.pop()
+        queue.pop()
+        assert queue.consecutive_holds == 2
+
+        queue.push_chunk(np.array([[1.0, 2.0]], dtype=np.float32))
+        queue.pop()
+        assert queue.consecutive_holds == 0
+
+    def test_total_counters(self) -> None:
+        queue = ActionQueue()
+        queue.pop()
+        queue.pop()
+        queue.push_chunk(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))
+        queue.pop()
+        queue.pop()
+        queue.pop()
+        queue.pop()
+
+        assert queue.total_holds == 3
+        assert queue.total_pops == 3
+
+    def test_remaining_property(self) -> None:
+        queue = ActionQueue()
+        assert queue.remaining == 0
+
+        queue.push_chunk(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
+        assert queue.remaining == 2
+
+        queue.pop()
+        assert queue.remaining == 1
+
+    def test_below_threshold(self) -> None:
+        queue = ActionQueue()
+        assert queue.below_threshold(1) is True
+
+        queue.push_chunk(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))
+        assert queue.below_threshold(4) is True
+        assert queue.below_threshold(3) is False
+        assert queue.below_threshold(2) is False
+
+    def test_clear(self) -> None:
+        queue = ActionQueue()
+        queue.push_chunk(np.array([[1.0], [2.0]], dtype=np.float32))
+        queue.pop()
+        queue.pop()
+        queue.pop()
+        assert queue.consecutive_holds == 1
+
+        queue.push_chunk(np.array([[3.0], [4.0]], dtype=np.float32))
+        queue.clear()
+
+        assert queue.remaining == 0
+        assert queue.consecutive_holds == 0
+        assert queue.total_holds == 1
+        assert queue.total_pops == 2
+
+    def test_push_with_offset(self) -> None:
+        queue = ActionQueue()
+        chunk = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
+        queue.push_chunk(chunk, offset=2)
+
+        assert queue.remaining == 1
+        action = queue.pop()
+        assert action is not None
+        np.testing.assert_array_equal(action, [5.0, 6.0])
+
+    def test_default_smoother_is_replace(self) -> None:
+        queue = ActionQueue()
+        assert isinstance(queue._smoother, ReplaceSmoother)
+
+    def test_smoother_integration_lerp(self) -> None:
+        queue = ActionQueue(smoother=LerpSmoother(duration_frames=5))
+        first = np.array([[10.0, 10.0], [20.0, 20.0], [30.0, 30.0]], dtype=np.float32)
+        queue.push_chunk(first)
+
+        second = np.array(
+            [[100.0, 100.0], [110.0, 110.0], [120.0, 120.0], [130.0, 130.0]],
+            dtype=np.float32,
+        )
+        queue.push_chunk(second)
+
+        first_action = queue.pop()
+        assert first_action is not None
+        assert not np.array_equal(first_action, second[0]), "LerpSmoother should blend, not replace"
+
+    def test_smoother_integration_replace(self) -> None:
+        queue = ActionQueue(smoother=ReplaceSmoother())
+        first = np.array([[1.0], [2.0], [3.0]], dtype=np.float32)
+        queue.push_chunk(first)
+
+        second = np.array([[10.0], [20.0]], dtype=np.float32)
+        queue.push_chunk(second)
+
+        assert queue.remaining == 2
+        action = queue.pop()
+        assert action is not None
+        np.testing.assert_array_equal(action, [10.0])
+
+
+class TestActionQueueThreadSafety:
+    def test_concurrent_push_pop(self) -> None:
+        queue = ActionQueue()
+        errors: list[Exception] = []
+        action_dim = 4
+        n_pushes = 100
+        chunk_size = 10
+
+        def pusher() -> None:
+            try:
+                for i in range(n_pushes):
+                    chunk = np.full((chunk_size, action_dim), float(i), dtype=np.float32)
+                    queue.push_chunk(chunk)
+            except Exception as exc:
+                errors.append(exc)
+
+        def popper(stop_event: threading.Event) -> None:
+            try:
+                while not stop_event.is_set():
+                    queue.pop()
+            except Exception as exc:
+                errors.append(exc)
+
+        stop = threading.Event()
+        push_thread = threading.Thread(target=pusher)
+        pop_thread = threading.Thread(target=popper, args=(stop,))
+
+        push_thread.start()
+        pop_thread.start()
+
+        push_thread.join(timeout=5.0)
+        stop.set()
+        pop_thread.join(timeout=5.0)
+
+        assert not push_thread.is_alive(), "Push thread deadlocked"
+        assert not pop_thread.is_alive(), "Pop thread deadlocked"
+        assert not errors, f"Thread errors: {errors}"
diff --git a/tests/unit/runtime/test_execution.py b/tests/unit/runtime/test_execution.py
new file mode 100644
index 0000000..08e354e
--- /dev/null
+++ b/tests/unit/runtime/test_execution.py
@@ -0,0 +1,235 @@
+from __future__ import annotations
+
+import threading
+import time
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pytest
+
+from physicalai.runtime._action_queue import ActionQueue
+from physicalai.runtime.execution import AsyncExecution, SyncExecution, WorkerDiedError
+
+
+def _make_mock_model(chunk: np.ndarray | None = None) -> MagicMock:
+    model = MagicMock()
+    if chunk is None:
+        chunk = np.random.randn(6, 4).astype(np.float32)
+    model.predict_action_chunk.return_value = chunk
+    return model
+
+
+class TestSyncExecution:
+    def test_warmup_seeds_queue_and_discovers_chunk_size(self) -> None:
+        chunk = np.random.randn(8, 3).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = SyncExecution()
+        obs = {"state": np.zeros(3)}
+
+        ex.start(model, queue)
+        ex.warmup(obs)
+
+        assert ex.chunk_size == 8
+        assert queue.remaining == 8
+        model.predict_action_chunk.assert_called_once_with(obs)
+
+    def test_maybe_request_refills_when_empty(self) -> None:
+        chunk = np.random.randn(4, 2).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = SyncExecution()
+        obs = {"state": np.zeros(2)}
+
+        ex.start(model, queue)
+        ex.warmup(obs)
+
+        for _ in range(4):
+            queue.pop()
+        assert queue.remaining == 0
+
+        model.predict_action_chunk.reset_mock()
+        model.predict_action_chunk.return_value = chunk
+        ex.maybe_request(obs)
+
+        assert queue.remaining == 4
+        model.predict_action_chunk.assert_called_once()
+
+    def test_maybe_request_does_not_refill_when_nonempty(self) -> None:
+        chunk = np.random.randn(4, 2).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = SyncExecution()
+        obs = {"state": np.zeros(2)}
+
+        ex.start(model, queue)
+        ex.warmup(obs)
+        queue.pop()
+
+        model.predict_action_chunk.reset_mock()
+        ex.maybe_request(obs)
+        model.predict_action_chunk.assert_not_called()
+
+    def test_stop_is_noop(self) -> None:
+        ex = SyncExecution()
+        ex.stop()
+
+
+class TestAsyncExecution:
+    def test_start_spawns_thread(self) -> None:
+        model = _make_mock_model()
+        queue = ActionQueue()
+        ex = AsyncExecution(fps=10)
+
+        ex.start(model, queue)
+        assert ex.alive is True
+        ex.stop()
+
+    def test_warmup_seeds_queue(self) -> None:
+        chunk = np.random.randn(6, 4).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = AsyncExecution(fps=10)
+
+        ex.start(model, queue)
+        obs = {"state": np.zeros(4)}
+        ex.warmup(obs)
+
+        assert ex.chunk_size == 6
+        assert queue.remaining == 6
+        ex.stop()
+
+    def test_maybe_request_submits_when_below_threshold(self) -> None:
+        chunk = np.random.randn(10, 2).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = AsyncExecution(threshold=0.5, fps=10)
+
+        ex.start(model, queue)
+        obs = {"state": np.zeros(2)}
+        ex.warmup(obs)
+
+        for _ in range(10):
+            queue.pop()
+
+        model.predict_action_chunk.reset_mock()
+        model.predict_action_chunk.return_value = chunk
+        ex.maybe_request(obs)
+
+        time.sleep(0.3)
+        assert queue.remaining > 0
+        ex.stop()
+
+    def test_defensive_copy_of_observation(self) -> None:
+        chunk = np.random.randn(4, 2).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = AsyncExecution(threshold=0.5, fps=10)
+
+        ex.start(model, queue)
+        obs = {"state": np.zeros(2)}
+        ex.warmup(obs)
+        for _ in range(4):
+            queue.pop()
+
+        model.predict_action_chunk.reset_mock()
+        original_state = np.array([1.0, 2.0])
+        obs_to_submit = {"state": original_state.copy()}
+        ex.maybe_request(obs_to_submit)
+        obs_to_submit["state"][:] = 99.0
+
+        time.sleep(0.3)
+        if model.predict_action_chunk.called:
+            submitted = model.predict_action_chunk.call_args[0][0]["state"]
+            np.testing.assert_array_equal(submitted, original_state)
+        ex.stop()
+
+    def test_worker_death_raises_error(self) -> None:
+        model = _make_mock_model()
+        model.predict_action_chunk.side_effect = [
+            np.random.randn(4, 2).astype(np.float32),
+            ValueError("model exploded"),
+        ]
+        queue = ActionQueue()
+        ex = AsyncExecution(threshold=0.5, fps=10)
+
+        ex.start(model, queue)
+        obs = {"state": np.zeros(2)}
+        ex.warmup(obs)
+
+        for _ in range(4):
+            queue.pop()
+
+        ex.maybe_request(obs)
+        time.sleep(0.5)
+
+        with pytest.raises(WorkerDiedError, match="model exploded"):
+            ex.maybe_request(obs)
+
+        ex.stop()
+
+    def test_stop_signals_and_joins(self) -> None:
+        model = _make_mock_model()
+        queue = ActionQueue()
+        ex = AsyncExecution(fps=10)
+
+        ex.start(model, queue)
+        assert ex.alive is True
+
+        ex.stop()
+        assert ex._thread is not None
+        assert not ex._thread.is_alive()
+
+    def test_health_properties(self) -> None:
+        chunk = np.random.randn(4, 2).astype(np.float32)
+        model = _make_mock_model(chunk)
+        queue = ActionQueue()
+        ex = AsyncExecution(fps=10)
+
+        ex.start(model, queue)
+        obs = {"state": np.zeros(2)}
+        ex.warmup(obs)
+
+        assert ex.inference_count == 0
+
+        for _ in range(4):
+            queue.pop()
+
+        model.predict_action_chunk.reset_mock()
+        model.predict_action_chunk.return_value = chunk
+        ex.maybe_request(obs)
+        time.sleep(0.3)
+
+        assert ex.inference_count >= 1
+        ex.stop()
+
+    def test_watchdog_triggers_force_reset(self) -> None:
+        chunk = np.random.randn(4, 2).astype(np.float32)
+        model = _make_mock_model(chunk)
+
+        call_count = 0
+        original_side_effect = None
+
+        def slow_predict(obs: dict) -> np.ndarray:
+            nonlocal call_count
+            call_count += 1
+            if call_count == 2:
+                time.sleep(100)
+            return chunk
+
+        model.predict_action_chunk.side_effect = slow_predict
+        queue = ActionQueue()
+        ex = AsyncExecution(threshold=0.5, fps=10, watchdog_timeout_s=0.1)
+
+        ex.start(model, queue)
+        obs = {"state": np.zeros(2)}
+        ex.warmup(obs)
+
+        for _ in range(4):
+            queue.pop()
+        ex.maybe_request(obs)
+
+        time.sleep(0.3)
+        ex.maybe_request(obs)
+
+        ex.stop()
diff --git a/tests/unit/runtime/test_runtime.py b/tests/unit/runtime/test_runtime.py
new file mode 100644
index 0000000..6f1b378
--- /dev/null
+++ b/tests/unit/runtime/test_runtime.py
@@ -0,0 +1,289 @@
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for physicalai.runtime.runtime."""
+
+from __future__ import annotations
+
+import time
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pytest
+
+from physicalai.runtime._action_queue import ActionQueue
+from physicalai.runtime.execution import SyncExecution, WorkerDiedError
+from physicalai.runtime.runtime import PolicyRuntime, RunStats, default_observation_to_input
+
+from physicalai.capture import Frame
+
+
+@dataclass
+class FakeRobotObservation:
+    joint_positions: np.ndarray
+    timestamp: float
+    sensor_data: dict[str, np.ndarray] | None
+    images: dict | None
+
+
+def _make_mock_robot(joint_positions: np.ndarray | None = None) -> MagicMock:
+    robot = MagicMock()
+    if joint_positions is None:
+        joint_positions = np.array([0.1, 0.2, 0.3], dtype=np.float32)
+    robot.get_observation.return_value = FakeRobotObservation(
+        joint_positions=joint_positions,
+        timestamp=time.monotonic(),
+        sensor_data=None,
+        images=None,
+    )
+    return robot
+
+
+def _make_mock_model(chunk_size: int = 4, action_dim: int = 3) -> MagicMock:
+    model = MagicMock()
+    model.predict_action_chunk.return_value = np.random.randn(chunk_size, action_dim).astype(np.float32)
+    return model
+
+
+def _exhaustible_side_effect(
+    initial_chunks: list[np.ndarray],
+    action_dim: int = 2,
+) -> Callable[[Any], np.ndarray]:
+    """Return *initial_chunks* in order, then empty arrays forever.
+
+    Prevents StopIteration when SyncExecution refills more times than
+    the test expected.
+    """
+    it = iter(initial_chunks)
+    empty = np.empty((0, action_dim), dtype=np.float32)
+    return lambda _obs: next(it, empty)
+
+
+class TestPolicyRuntime:
+    def test_full_loop_with_duration(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model(chunk_size=20, action_dim=3)
+        execution = SyncExecution()
+        queue = ActionQueue()
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            action_queue=queue,
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            stats = runtime.run(duration_s=0.5)
+
+        assert stats.steps == 5
+        assert robot.send_action.call_count >= 5
+
+    def test_hold_fallback_when_queue_empty(self) -> None:
+        robot = _make_mock_robot()
+        chunk = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        model = _make_mock_model()
+        model.predict_action_chunk.side_effect = _exhaustible_side_effect([chunk], action_dim=2)
+
+        execution = SyncExecution()
+        queue = ActionQueue()
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            action_queue=queue,
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            stats = runtime.run(duration_s=0.4)
+
+        assert stats.steps == 4
+        assert robot.send_action.call_count == 4
+
+    def test_worker_died_error_propagation(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model(chunk_size=4)
+
+        execution = MagicMock()
+        execution.start = MagicMock()
+        execution.warmup = MagicMock()
+        execution.maybe_request.side_effect = WorkerDiedError("dead")
+        execution.stop = MagicMock()
+
+        queue = ActionQueue()
+        queue.push_chunk(np.random.randn(4, 3).astype(np.float32))
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            action_queue=queue,
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time, pytest.raises(WorkerDiedError, match="dead"):
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            runtime.run(duration_s=1.0)
+
+    def test_shutdown_does_not_disconnect(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model()
+        execution = SyncExecution()
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            runtime.run(duration_s=0.1)
+
+        robot.disconnect.assert_not_called()
+
+
+class TestRuntimeCallback:
+    def test_before_send_action_called(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model(chunk_size=10)
+        execution = SyncExecution()
+        callback = MagicMock()
+        callback.before_send_action.return_value = None
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            callbacks=[callback],
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            runtime.run(duration_s=0.2)
+
+        assert callback.before_send_action.call_count == 2
+
+    def test_callback_raises_does_not_crash_loop(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model(chunk_size=10)
+        execution = SyncExecution()
+        bad_callback = MagicMock()
+        bad_callback.before_send_action.side_effect = RuntimeError("oops")
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            callbacks=[bad_callback],
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            stats = runtime.run(duration_s=0.3)
+
+        assert stats.steps == 3
+
+    def test_on_hold_called_when_queue_empty(self) -> None:
+        robot = _make_mock_robot()
+        chunk = np.array([[1.0, 2.0]], dtype=np.float32)
+        model = _make_mock_model()
+        model.predict_action_chunk.side_effect = _exhaustible_side_effect([chunk], action_dim=2)
+
+        execution = SyncExecution()
+        callback = MagicMock()
+        callback.before_send_action.return_value = None
+        callback.on_hold.return_value = None
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            callbacks=[callback],
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            runtime.run(duration_s=0.3)
+
+        assert callback.on_hold.call_count >= 1
+
+
+class TestRunStats:
+    def test_fields_populated(self) -> None:
+        stats = RunStats(steps=10, total_pops=8, total_holds=2, inference_count=3)
+        assert stats.steps == 10
+        assert stats.total_pops == 8
+        assert stats.total_holds == 2
+        assert stats.inference_count == 3
+
+
+class TestDefaultObservationToInput:
+    def test_extracts_joint_positions(self) -> None:
+        positions = np.array([0.1, 0.2, 0.3], dtype=np.float32)
+        obs = FakeRobotObservation(
+            joint_positions=positions,
+            timestamp=0.0,
+            sensor_data=None,
+            images=None,
+        )
+
+        result = default_observation_to_input(obs, {})
+
+        assert "state" in result
+        np.testing.assert_array_equal(result["state"], np.array([positions], dtype=np.float32))
+
+    def test_extracts_frame_data(self) -> None:
+        obs = FakeRobotObservation(
+            joint_positions=np.zeros(3),
+            timestamp=0.0,
+            sensor_data=None,
+            images=None,
+        )
+        frame = Frame(data=np.zeros((480, 640, 3), dtype=np.uint8), timestamp=0.0, sequence=0)
+
+        result = default_observation_to_input(obs, camera_frames={"cam0": frame})
+
+        assert "images.cam0" in result
+        np.testing.assert_array_equal(result["images.cam0"], frame.data)
+
+    def test_custom_obs_to_input(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model(chunk_size=10)
+        execution = SyncExecution()
+
+        custom_fn = MagicMock(return_value={"custom_key": np.zeros(3)})
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+            obs_to_input=custom_fn,
+        )
+
+        with patch("physicalai.runtime.runtime.time") as mock_time:
+            mock_time.perf_counter.return_value = 0.0
+            mock_time.sleep = MagicMock()
+            runtime.run(duration_s=0.1)
+
+        assert custom_fn.call_count >= 2
diff --git a/tests/unit/runtime/test_smoothers.py b/tests/unit/runtime/test_smoothers.py
new file mode 100644
index 0000000..d5c02b0
--- /dev/null
+++ b/tests/unit/runtime/test_smoothers.py
@@ -0,0 +1,126 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from physicalai.runtime.smoothers import LerpSmoother, ReplaceSmoother
+
+
+class TestReplaceSmoother:
+    def test_merge_drops_remaining_and_returns_incoming_offset(self) -> None:
+        smoother = ReplaceSmoother()
+        remaining = np.array([[1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
+        incoming = np.array([[2.0, 2.0], [3.0, 3.0], [4.0, 4.0]], dtype=np.float32)
+
+        result = smoother.merge(remaining, incoming, offset=1)
+
+        np.testing.assert_array_equal(
+            result,
+            np.array([[3.0, 3.0], [4.0, 4.0]], dtype=np.float32),
+        )
+
+    def test_offset_zero_returns_all_incoming(self) -> None:
+        smoother = ReplaceSmoother()
+        remaining = np.array([[1.0, 1.0]], dtype=np.float32)
+        incoming = np.array([[2.0, 2.0], [3.0, 3.0]], dtype=np.float32)
+
+        result = smoother.merge(remaining, incoming, offset=0)
+
+        np.testing.assert_array_equal(result, incoming)
+
+    def test_offset_beyond_incoming_returns_empty_array(self) -> None:
+        smoother = ReplaceSmoother()
+        remaining = np.array([[1.0, 1.0]], dtype=np.float32)
+        incoming = np.array([[2.0, 2.0]], dtype=np.float32)
+
+        result = smoother.merge(remaining, incoming, offset=5)
+
+        assert result.shape == (0, 2)
+        assert result.dtype == incoming.dtype
+
+
+class TestLerpSmoother:
+    def test_lerp_weights_match_queue_mixer_formula(self) -> None:
+        smoother = LerpSmoother(duration_frames=5)
+        remaining = np.array([[10.0, 10.0], [20.0, 20.0], [30.0, 30.0]], dtype=np.float32)
+        incoming = np.array(
+            [[100.0, 100.0], [110.0, 110.0], [120.0, 120.0], [130.0, 130.0]],
+            dtype=np.float32,
+        )
+
+        result = smoother.merge(remaining, incoming, offset=0)
+
+        expected = np.array(
+            [[10.0, 10.0], [50.0, 50.0], [90.0, 90.0], [130.0, 130.0]],
+            dtype=np.float32,
+        )
+        np.testing.assert_array_equal(result, expected)
+
+    def test_offset_aware_duration(self) -> None:
+        smoother = LerpSmoother(duration_frames=99)
+        remaining = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], dtype=np.float32)
+        incoming = np.array([[4.0, 4.0], [5.0, 5.0], [6.0, 6.0], [7.0, 7.0]], dtype=np.float32)
+
+        result = smoother.merge(remaining, incoming, offset=2)
+
+        expected = np.array([[1.0, 1.0], [4.5, 4.5]], dtype=np.float32)
+        np.testing.assert_array_equal(result, expected)
+
+    def test_edge_cases_empty_remaining_single_element_and_offset_beyond_chunk(self) -> None:
+        smoother = LerpSmoother(duration_frames=5)
+
+        empty_result = smoother.merge(
+            np.empty((0, 2), dtype=np.float32),
+            np.array([[1.0, 1.0]], dtype=np.float32),
+            offset=0,
+        )
+        np.testing.assert_array_equal(empty_result, np.array([[1.0, 1.0]], dtype=np.float32))
+
+        single_result = smoother.merge(
+            np.array([[1.0, 1.0]], dtype=np.float32),
+            np.array([[2.0, 2.0]], dtype=np.float32),
+            offset=0,
+        )
+        np.testing.assert_array_equal(single_result, np.array([[1.0, 1.0]], dtype=np.float32))
+
+        offset_beyond_result = smoother.merge(
+            np.array([[1.0, 1.0]], dtype=np.float32),
+            np.array([[2.0, 2.0]], dtype=np.float32),
+            offset=5,
+        )
+        assert offset_beyond_result.shape == (0, 2)
+
+    def test_exact_numerical_blend_values(self) -> None:
+        smoother = LerpSmoother(duration_frames=5)
+        remaining = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], dtype=np.float32)
+        incoming = np.array(
+            [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0], [2.0, 2.0]],
+            dtype=np.float32,
+        )
+
+        result = smoother.merge(remaining, incoming, offset=1)
+
+        expected = np.array(
+            [[1.0, 1.0], [2.0, 2.0], [2.0, 2.0]],
+            dtype=np.float32,
+        )
+        np.testing.assert_array_equal(result, expected)
+
+    def test_merge_is_stateless_for_same_arguments(self) -> None:
+        smoother = LerpSmoother(duration_frames=5)
+        remaining = np.array([[1.0, 1.0], [2.0, 2.0]], dtype=np.float32)
+        incoming = np.array([[3.0, 3.0], [4.0, 4.0], [5.0, 5.0]], dtype=np.float32)
+
+        first = smoother.merge(remaining, incoming, offset=1)
+        second = smoother.merge(remaining, incoming, offset=1)
+
+        np.testing.assert_array_equal(first, second)
+
+
+def test_input_validation_mismatched_action_dim_raises_value_error() -> None:
+    smoother = ReplaceSmoother()
+    remaining = np.array([[1.0, 1.0]], dtype=np.float32)
+    incoming = np.array([[2.0, 2.0, 2.0]], dtype=np.float32)
+
+    with pytest.raises(ValueError, match="action_dim"):
+        smoother.merge(remaining, incoming, offset=0)

From ab1bb3141e9324ebb5c9ea222545f3d6269fce30 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Sun, 17 May 2026 16:09:41 +0200
Subject: [PATCH 03/19] add goal time to send action

---
 examples/runtime/inference_pi05_gpu.ipynb | 838 ----------------------
 src/physicalai/robot/__init__.py          |   3 +-
 src/physicalai/runtime/runtime.py         |   3 +-
 3 files changed, 4 insertions(+), 840 deletions(-)
 delete mode 100644 examples/runtime/inference_pi05_gpu.ipynb

diff --git a/examples/runtime/inference_pi05_gpu.ipynb b/examples/runtime/inference_pi05_gpu.ipynb
deleted file mode 100644
index fcd656c..0000000
--- a/examples/runtime/inference_pi05_gpu.ipynb
+++ /dev/null
@@ -1,838 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "68d10b89",
-   "metadata": {},
-   "source": [
-    "# Pi05 GPU Inference (PyTorch + CUDA)\n",
-    "\n",
-    "Loads Eugene's finetuned Pi05 checkpoint and runs inference on RTX 3090.\n",
-    "Expected: ~1-5s per action chunk (vs ~90s on CPU with OpenVINO)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cf8f6bdd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "import numpy as np\n",
-    "import cv2\n",
-    "from collections import deque\n",
-    "from matplotlib import pyplot as plt\n",
-    "\n",
-    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
-    "if torch.cuda.is_available():\n",
-    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
-    "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "98f8f8c7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# --- Config ---\n",
-    "CHECKPOINT_PATH = \"pi05/pi05_eugene.ckpt\"\n",
-    "DEVICE = \"cuda\"  # \"cuda\" or \"cpu\"\n",
-    "TASK = \"pick a can and place it in a bowl\"\n",
-    "\n",
-    "JOINT_NAMES = [\n",
-    "    \"shoulder_pan\",\n",
-    "    \"shoulder_lift\",\n",
-    "    \"elbow_flex\",\n",
-    "    \"wrist_flex\",\n",
-    "    \"wrist_roll\",\n",
-    "    \"gripper\",\n",
-    "]\n",
-    "\n",
-    "# Camera config (SharedCamera UVC device indices)\n",
-    "OVERHEAD_CAMERA_INDEX = \"/dev/v4l/by-id/usb-UGREEN_Camera_2K_UGREEN_Camera_2K_SN0001-video-index0\"\n",
-    "ARM_CAMERA_INDEX = \"/dev/v4l/by-id/usb-Innomaker_Innomaker-U20CAM-1080p-S1_SN0001-video-index0\"\n",
-    "FLIP_CAMERAS = {\"arm\"}  # Cameras mounted upside down\n",
-    "\n",
-    "CAMERA_WIDTH = 640\n",
-    "CAMERA_FPS = 30\n",
-    "CAMERA_HEIGHT = 480"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8f32365d",
-   "metadata": {},
-   "source": [
-    "## Load Model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9dce2df9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from physicalai.policies.pi05 import Pi05\n",
-    "\n",
-    "# Load to CPU first to avoid OOM if GPU has residual allocations\n",
-    "policy = Pi05.load_from_checkpoint(CHECKPOINT_PATH, map_location=\"cpu\", compile_model=False)\n",
-    "policy = policy.to(DEVICE)\n",
-    "policy.eval()\n",
-    "print(f\"Model loaded on {DEVICE}\")\n",
-    "print(f\"VRAM used: {torch.cuda.memory_allocated() / 1e9:.2f} GB\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ad387d02",
-   "metadata": {},
-   "source": [
-    "## Connect Cameras & Robot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e022b604",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cv2_enumerate_cameras import enumerate_cameras\n",
-    "\n",
-    "print(\"Available cameras:\")\n",
-    "for c in enumerate_cameras(apiPreference=cv2.CAP_GSTREAMER):\n",
-    "    print(f\" - {c.name} (index {c.index})\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f8c0a963",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from physicalai.capture import SharedCamera\n",
-    "from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig\n",
-    "\n",
-    "# Cameras via SharedCamera (UVC)\n",
-    "cameras = {\n",
-    "    \"overhead\": SharedCamera(\"uvc\", device=OVERHEAD_CAMERA_INDEX, width=CAMERA_WIDTH, height=CAMERA_HEIGHT, fps=CAMERA_FPS),\n",
-    "    \"arm\": SharedCamera(\"uvc\", device=ARM_CAMERA_INDEX, width=CAMERA_WIDTH, height=CAMERA_HEIGHT, fps=CAMERA_FPS),\n",
-    "}\n",
-    "for name, cam in cameras.items():\n",
-    "    cam.connect()\n",
-    "    print(f\"Camera '{name}' connected: {cam.actual_width}x{cam.actual_height} @ {cam.actual_fps}fps\")\n",
-    "\n",
-    "# Robot without cameras\n",
-    "robot_cfg = SO101FollowerConfig(\n",
-    "    port=\"/dev/ttyACM1\",\n",
-    "    id=\"my_so101_follower\",\n",
-    "\n",
-    ")\n",
-    "\n",
-    "robot = SO101Follower(robot_cfg)\n",
-    "robot.connect()\n",
-    "print(\"Robot connected\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "049e73d1",
-   "metadata": {},
-   "source": [
-    "## Get Observation & Display"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c1630af6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_full_observation() -> dict:\n",
-    "    \"\"\"Combine robot state + SharedCamera images into one dict.\"\"\"\n",
-    "    obs = robot.get_observation()\n",
-    "    for cam_key, cam in cameras.items():\n",
-    "        frame = cam.read_latest()\n",
-    "        obs[cam_key] = frame.data  # numpy HWC RGB\n",
-    "    return obs\n",
-    "\n",
-    "def viz_observation(obs: dict):\n",
-    "    print(\"Joint positions:\")\n",
-    "    for jn in JOINT_NAMES:\n",
-    "        print(f\"  {jn}: {obs[f'{jn}.pos']:.4f}\")\n",
-    "\n",
-    "    fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
-    "    for ax, cam_key in zip(axes, [\"overhead\", \"arm\"]):\n",
-    "        img = np.ascontiguousarray(obs[cam_key])\n",
-    "        if cam_key in FLIP_CAMERAS:\n",
-    "            img = cv2.rotate(img, cv2.ROTATE_180)\n",
-    "        ax.imshow(img)\n",
-    "        ax.set_title(f\"{cam_key} ({img.shape})\")\n",
-    "        ax.axis(\"off\")\n",
-    "\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()\n",
-    "\n",
-    "obs = get_full_observation()\n",
-    "viz_observation(obs)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1ef87e90",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c899db7a",
-   "metadata": {},
-   "source": [
-    "## Build Observation for Model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "eb2ceb63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from physicalai.data.observation import Observation\n",
-    "\n",
-    "def build_observation(obs: dict) -> Observation:\n",
-    "    \"\"\"Convert robot observation to Pi05 Observation.\"\"\"\n",
-    "    # State: (1, 6) float32\n",
-    "    state = torch.tensor(\n",
-    "        [[obs[f\"{jn}.pos\"] for jn in JOINT_NAMES]], dtype=torch.float32, device=DEVICE\n",
-    "    )\n",
-    "\n",
-    "    # Images: {name: (1, 3, H, W) float32 in [0, 1]}\n",
-    "    def img_to_tensor(img: np.ndarray) -> torch.Tensor:\n",
-    "        t = torch.from_numpy(img.copy()).float() / 255.0  # (H, W, 3) -> [0,1]\n",
-    "        t = t.permute(2, 0, 1).unsqueeze(0)  # (1, 3, H, W)\n",
-    "        return t.to(DEVICE)\n",
-    "\n",
-    "    images = {}\n",
-    "    for cam_key in [\"overhead\", \"arm\"]:\n",
-    "        img = np.ascontiguousarray(obs[cam_key])\n",
-    "        if cam_key in FLIP_CAMERAS:\n",
-    "            img = cv2.rotate(img, cv2.ROTATE_180)\n",
-    "        images[cam_key] = img_to_tensor(img)\n",
-    "\n",
-    "    return Observation(state=state, images=images, task=TASK)\n",
-    "\n",
-    "observation = build_observation(get_full_observation())\n",
-    "print(f\"State shape: {observation.state.shape}\")\n",
-    "for k, v in observation.images.items():\n",
-    "    print(f\"Image '{k}': {v.shape}, device={v.device}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3a0557e9",
-   "metadata": {},
-   "source": [
-    "## Run Inference (single step, timed)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5167ad02",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "\n",
-    "# Reset action queue to get a fresh prediction\n",
-    "policy._action_queue = deque()\n",
-    "\n",
-    "# Warm up GPU\n",
-    "if DEVICE == \"cuda\":\n",
-    "    torch.cuda.synchronize()\n",
-    "\n",
-    "t0 = time.perf_counter()\n",
-    "with torch.no_grad():\n",
-    "    action = policy.select_action(observation)\n",
-    "if DEVICE == \"cuda\":\n",
-    "    torch.cuda.synchronize()\n",
-    "latency = time.perf_counter() - t0\n",
-    "\n",
-    "action_np = action.detach().cpu().numpy()\n",
-    "if action_np.ndim == 2:\n",
-    "    action_np = action_np[0]\n",
-    "\n",
-    "print(f\"Inference latency: {latency*1000:.1f} ms\")\n",
-    "print(f\"Action shape: {action_np.shape}\")\n",
-    "print(f\"Action: {action_np}\")\n",
-    "print()\n",
-    "for i, jn in enumerate(JOINT_NAMES):\n",
-    "    print(f\"  {jn}: {action_np[i]:.4f}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "130d146d",
-   "metadata": {},
-   "source": [
-    "## Benchmark (5 inferences)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0a7dd57d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "latencies = []\n",
-    "for i in range(5):\n",
-    "    policy._action_queue = deque()\n",
-    "    if DEVICE == \"cuda\":\n",
-    "        torch.cuda.synchronize()\n",
-    "    t0 = time.perf_counter()\n",
-    "    with torch.no_grad():\n",
-    "        action = policy.select_action(observation)\n",
-    "    if DEVICE == \"cuda\":\n",
-    "        torch.cuda.synchronize()\n",
-    "    latencies.append(time.perf_counter() - t0)\n",
-    "\n",
-    "print(f\"Latencies: {[f'{l*1000:.1f}ms' for l in latencies]}\")\n",
-    "print(f\"Mean: {np.mean(latencies)*1000:.1f} ms\")\n",
-    "print(f\"Min:  {np.min(latencies)*1000:.1f} ms\")\n",
-    "print(f\"Max:  {np.max(latencies)*1000:.1f} ms\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "72d136ad",
-   "metadata": {},
-   "source": [
-    "## Send Action to Robot\n",
-    "\n",
-    "**Run only when ready to move.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "21aeb2ac",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "action_dict = {f\"{jn}.pos\": float(action_np[i]) for i, jn in enumerate(JOINT_NAMES)}\n",
-    "robot.send_action(action_dict)\n",
-    "print(f\"Sent: {action_dict}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f7084ab5",
-   "metadata": {},
-   "source": [
-    "## Rollout Loop (observe → infer → execute full chunk)\n",
-    "\n",
-    "Runs a continuous loop: get observation, predict action chunk, execute all actions in the chunk at 30Hz, then repeat. **Interrupt the kernel to stop.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "66acc85f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# FPS = 30\n",
-    "# CHUNK_SIZE = 50  # Pi05 default action chunk length\n",
-    "\n",
-    "# try:\n",
-    "#     step = 0\n",
-    "#     while True:\n",
-    "#         # Get fresh observation\n",
-    "#         obs = get_full_observation()\n",
-    "#         observation = build_observation(obs)\n",
-    "\n",
-    "#         # Generate full action chunk\n",
-    "#         policy._action_queue = deque()\n",
-    "#         t0 = time.perf_counter()\n",
-    "#         with torch.no_grad():\n",
-    "#             action = policy.select_action(observation)\n",
-    "#         infer_ms = (time.perf_counter() - t0) * 1000\n",
-    "\n",
-    "#         # Drain remaining actions from the queue (select_action returns first, rest are queued)\n",
-    "#         actions = [action.detach().cpu().numpy()]\n",
-    "#         while len(policy._action_queue) > 0:\n",
-    "#             a = policy._action_queue.popleft()\n",
-    "#             actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)\n",
-    "\n",
-    "#         print(f\"[step {step}] Inferred {len(actions)} actions in {infer_ms:.0f}ms\")\n",
-    "\n",
-    "#         # Execute each action in the chunk at target FPS\n",
-    "#         for i, act in enumerate(actions):\n",
-    "#             t_act = time.perf_counter()\n",
-    "#             act = np.squeeze(act)\n",
-    "#             action_dict = {f\"{jn}.pos\": float(act[j]) for j, jn in enumerate(JOINT_NAMES)}\n",
-    "#             robot.send_action(action_dict)\n",
-    "\n",
-    "#             # Sleep to maintain target FPS\n",
-    "#             elapsed = time.perf_counter() - t_act\n",
-    "#             sleep_time = (1.0 / FPS) - elapsed\n",
-    "#             if sleep_time > 0:\n",
-    "#                 time.sleep(sleep_time)\n",
-    "\n",
-    "#         step += 1\n",
-    "\n",
-    "# except KeyboardInterrupt:\n",
-    "#     print(f\"\\nStopped after {step} chunks\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cd3ce7cc",
-   "metadata": {},
-   "source": [
-    "## Async Threaded Rollout\n",
-    "\n",
-    "Background thread runs GPU inference while the main thread sends actions at 30Hz.\n",
-    "Lerp-blends overlapping chunks for smooth transitions. **Interrupt the kernel to stop.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1d054a13",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import threading\n",
-    "\n",
-    "class QueueMixer:\n",
-    "    \"\"\"Action queue with lerp blending between chunks.\"\"\"\n",
-    "\n",
-    "    def __init__(self, lerp_duration: int = 5):\n",
-    "        self.queue: np.ndarray | None = None\n",
-    "        self.lerp_duration = lerp_duration\n",
-    "        self.index = 0\n",
-    "\n",
-    "    def add(self, chunk: np.ndarray, offset: int = 0):\n",
-    "        if self.queue is None or self.index >= len(self.queue):\n",
-    "            self.queue = chunk[offset:]\n",
-    "            self.index = 0\n",
-    "            return\n",
-    "\n",
-    "        remaining = self.queue[self.index:]\n",
-    "        incoming = chunk[offset:]\n",
-    "        n_remain = len(remaining)\n",
-    "        lerp_dur = min(n_remain, self.lerp_duration)\n",
-    "\n",
-    "        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)[:, np.newaxis]\n",
-    "        n_blend = min(n_remain, len(incoming))\n",
-    "        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]\n",
-    "        self.queue = np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)\n",
-    "        self.index = 0\n",
-    "\n",
-    "    def pop(self) -> np.ndarray | None:\n",
-    "        if self.queue is None or self.index >= len(self.queue):\n",
-    "            return None\n",
-    "        action = self.queue[self.index]\n",
-    "        self.index += 1\n",
-    "        return action\n",
-    "\n",
-    "    @property\n",
-    "    def remaining(self) -> int:\n",
-    "        if self.queue is None:\n",
-    "            return 0\n",
-    "        return max(len(self.queue) - self.index, 0)\n",
-    "\n",
-    "\n",
-    "class GpuInferenceThread:\n",
-    "    \"\"\"Background thread that runs PyTorch GPU inference on demand.\"\"\"\n",
-    "\n",
-    "    def __init__(self, policy, device: str):\n",
-    "        self.policy = policy\n",
-    "        self.device = device\n",
-    "        self._lock = threading.Lock()\n",
-    "        self._obs_slot: dict | None = None\n",
-    "        self._obs_ready = threading.Event()\n",
-    "        self._running_inference = False\n",
-    "        self._request_time = 0.0\n",
-    "        self._result_lock = threading.Lock()\n",
-    "        self._result_slot: tuple[np.ndarray, float] | None = None\n",
-    "        self._stop = threading.Event()\n",
-    "        self._thread = threading.Thread(target=self._run, name=\"GpuInferenceThread\", daemon=True)\n",
-    "        self.inference_count = 0\n",
-    "\n",
-    "    def start(self):\n",
-    "        self._thread.start()\n",
-    "\n",
-    "    def stop(self):\n",
-    "        self._stop.set()\n",
-    "        self._obs_ready.set()\n",
-    "        self._thread.join(timeout=10.0)\n",
-    "\n",
-    "    def request(self, obs_dict: dict) -> bool:\n",
-    "        with self._lock:\n",
-    "            if self._obs_slot is not None or self._running_inference:\n",
-    "                return False\n",
-    "            self._obs_slot = obs_dict\n",
-    "            self._request_time = time.perf_counter()\n",
-    "        self._obs_ready.set()\n",
-    "        return True\n",
-    "\n",
-    "    @property\n",
-    "    def busy(self) -> bool:\n",
-    "        with self._lock:\n",
-    "            return self._obs_slot is not None or self._running_inference\n",
-    "\n",
-    "    @property\n",
-    "    def alive(self) -> bool:\n",
-    "        return self._thread.is_alive()\n",
-    "\n",
-    "    @property\n",
-    "    def busy_duration(self) -> float:\n",
-    "        \"\"\"Seconds since last request was submitted. 0 if not busy.\"\"\"\n",
-    "        with self._lock:\n",
-    "            if not (self._obs_slot is not None or self._running_inference):\n",
-    "                return 0.0\n",
-    "            return time.perf_counter() - self._request_time\n",
-    "\n",
-    "    def force_reset(self):\n",
-    "        \"\"\"Clear stuck state so new requests can be submitted.\"\"\"\n",
-    "        with self._lock:\n",
-    "            self._obs_slot = None\n",
-    "            self._running_inference = False\n",
-    "        print(\"[InferenceThread] Force reset — cleared stuck state\")\n",
-    "\n",
-    "    def get_result(self) -> tuple[np.ndarray, float] | None:\n",
-    "        with self._result_lock:\n",
-    "            r = self._result_slot\n",
-    "            self._result_slot = None\n",
-    "        return r\n",
-    "\n",
-    "    def _run(self):\n",
-    "        while not self._stop.is_set():\n",
-    "            self._obs_ready.wait()\n",
-    "            self._obs_ready.clear()\n",
-    "            if self._stop.is_set():\n",
-    "                break\n",
-    "\n",
-    "            with self._lock:\n",
-    "                obs_dict = self._obs_slot\n",
-    "                self._obs_slot = None\n",
-    "                self._running_inference = True\n",
-    "\n",
-    "            if obs_dict is None:\n",
-    "                with self._lock:\n",
-    "                    self._running_inference = False\n",
-    "                continue\n",
-    "\n",
-    "            self.inference_count += 1\n",
-    "            t0 = time.perf_counter()\n",
-    "\n",
-    "            try:\n",
-    "                observation = build_observation(obs_dict)\n",
-    "                self.policy._action_queue = deque()\n",
-    "\n",
-    "                with torch.no_grad():\n",
-    "                    action = self.policy.select_action(observation)\n",
-    "                if self.device == \"cuda\":\n",
-    "                    torch.cuda.synchronize()\n",
-    "\n",
-    "                # Collect full chunk\n",
-    "                actions = [action.detach().cpu().numpy()]\n",
-    "                while len(self.policy._action_queue) > 0:\n",
-    "                    a = self.policy._action_queue.popleft()\n",
-    "                    actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)\n",
-    "\n",
-    "                chunk = np.array([np.squeeze(a) for a in actions], dtype=np.float32)\n",
-    "            except Exception as e:\n",
-    "                import traceback\n",
-    "                print(f\"[InferenceThread] Error: {e}\")\n",
-    "                traceback.print_exc()\n",
-    "                with self._lock:\n",
-    "                    self._running_inference = False\n",
-    "                continue\n",
-    "\n",
-    "            latency = time.perf_counter() - t0\n",
-    "            with self._result_lock:\n",
-    "                self._result_slot = (chunk, latency)\n",
-    "            with self._lock:\n",
-    "                self._running_inference = False\n",
-    "\n",
-    "print(\"QueueMixer and GpuInferenceThread defined\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "664e5887",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ASYNC_FPS = 30\n",
-    "QUEUE_THRESHOLD_FRAC = 0.5  # Request new inference when queue drops below this fraction\n",
-    "LERP_DURATION = 5           # Frames over which to blend old/new chunks\n",
-    "PLOT_INTERVAL = 1           # Seconds between live plot updates\n",
-    "\n",
-    "# --- Warm-up to determine chunk size ---\n",
-    "warmup_obs = get_full_observation()\n",
-    "warmup_observation = build_observation(warmup_obs)\n",
-    "policy._action_queue = deque()\n",
-    "with torch.no_grad():\n",
-    "    warmup_action = policy.select_action(warmup_observation)\n",
-    "warmup_actions = [warmup_action.detach().cpu().numpy()]\n",
-    "while len(policy._action_queue) > 0:\n",
-    "    a = policy._action_queue.popleft()\n",
-    "    warmup_actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)\n",
-    "warmup_chunk = np.array([np.squeeze(a) for a in warmup_actions], dtype=np.float32)\n",
-    "chunk_size = warmup_chunk.shape[0]\n",
-    "threshold = int(chunk_size * QUEUE_THRESHOLD_FRAC)\n",
-    "print(f\"Chunk size: {chunk_size}, threshold: {threshold}\")\n",
-    "\n",
-    "# --- Telemetry collector ---\n",
-    "telemetry = {\n",
-    "    \"t\": [],                  # wall time per step\n",
-    "    \"actions_sent\": [],       # action_array for each action sent\n",
-    "    \"queue_depth\": [],        # (t, remaining) per step\n",
-    "    \"infer_requests\": [],     # t when inference was requested\n",
-    "    \"infer_completions\": [],  # (t, latency_ms, offset)\n",
-    "    \"chunks_received\": [],    # (t_received, chunk_array, inference_num)\n",
-    "    \"holds\": [],              # t when queue was empty (holding position)\n",
-    "}\n",
-    "t_start = time.perf_counter()\n",
-    "\n",
-    "# --- Live plotting ---\n",
-    "import matplotlib.patches as mpatches\n",
-    "from IPython.display import display\n",
-    "\n",
-    "def render_telemetry(telemetry, threshold, chunk_size):\n",
-    "    \"\"\"Render telemetry to a matplotlib figure.\"\"\"\n",
-    "    actions_arr = np.array(telemetry[\"actions_sent\"])\n",
-    "    t_arr = np.array(telemetry[\"t\"])\n",
-    "    if len(t_arr) < 2:\n",
-    "        return None\n",
-    "\n",
-    "    queue_t, queue_d = zip(*telemetry[\"queue_depth\"])\n",
-    "    n_joints = len(JOINT_NAMES)\n",
-    "    n_chunks = len(telemetry[\"chunks_received\"])\n",
-    "    chunk_cmap = plt.cm.Set2\n",
-    "\n",
-    "    fig, axes = plt.subplots(n_joints + 1, 1, figsize=(16, 2.5 * (n_joints + 1)), sharex=True)\n",
-    "\n",
-    "    for j, jn in enumerate(JOINT_NAMES):\n",
-    "        ax = axes[j]\n",
-    "        ax.plot(t_arr, actions_arr[:, j], color=\"black\", linewidth=1.2, zorder=3)\n",
-    "\n",
-    "        for idx, (t_recv, chunk, _) in enumerate(telemetry[\"chunks_received\"]):\n",
-    "            color = chunk_cmap(idx % 8)\n",
-    "            chunk_t = t_recv + np.arange(len(chunk)) / ASYNC_FPS\n",
-    "            ax.plot(chunk_t, chunk[:, j], color=color, alpha=0.4, linewidth=1.5, zorder=2)\n",
-    "\n",
-    "        for t_req in telemetry[\"infer_requests\"]:\n",
-    "            ax.axvline(t_req, color=\"green\", alpha=0.3, linewidth=0.8, linestyle=\"--\")\n",
-    "        for t_comp, _, _ in telemetry[\"infer_completions\"]:\n",
-    "            ax.axvline(t_comp, color=\"blue\", alpha=0.3, linewidth=0.8, linestyle=\"-.\")\n",
-    "        for ht in telemetry[\"holds\"]:\n",
-    "            ax.axvspan(ht, ht + 1/ASYNC_FPS, color=\"red\", alpha=0.15)\n",
-    "\n",
-    "        ax.set_ylabel(jn, fontsize=9)\n",
-    "        ax.tick_params(labelsize=8)\n",
-    "\n",
-    "    ax_q = axes[-1]\n",
-    "    ax_q.fill_between(queue_t, queue_d, alpha=0.3, color=\"steelblue\")\n",
-    "    ax_q.plot(queue_t, queue_d, color=\"steelblue\", linewidth=1)\n",
-    "    ax_q.axhline(threshold, color=\"orange\", linestyle=\"--\", linewidth=1, label=f\"threshold ({threshold})\")\n",
-    "    for t_req in telemetry[\"infer_requests\"]:\n",
-    "        ax_q.axvline(t_req, color=\"green\", alpha=0.5, linewidth=1, linestyle=\"--\")\n",
-    "    for t_comp, _, _ in telemetry[\"infer_completions\"]:\n",
-    "        ax_q.axvline(t_comp, color=\"blue\", alpha=0.5, linewidth=1, linestyle=\"-.\")\n",
-    "    ax_q.set_ylabel(\"Queue\", fontsize=9)\n",
-    "    ax_q.set_xlabel(\"Time (s)\", fontsize=10)\n",
-    "    ax_q.legend(loc=\"upper right\", fontsize=8)\n",
-    "    ax_q.tick_params(labelsize=8)\n",
-    "\n",
-    "    legend_handles = [\n",
-    "        mpatches.Patch(color=\"black\", label=\"Sent to robot\"),\n",
-    "        mpatches.Patch(color=\"green\", alpha=0.5, label=\"Infer requested\"),\n",
-    "        mpatches.Patch(color=\"blue\", alpha=0.5, label=\"Chunk received\"),\n",
-    "        mpatches.Patch(color=\"red\", alpha=0.3, label=\"Hold\"),\n",
-    "    ]\n",
-    "    for idx in range(min(n_chunks, 6)):\n",
-    "        legend_handles.append(mpatches.Patch(color=chunk_cmap(idx % 8), alpha=0.5, label=f\"Chunk #{idx+1}\"))\n",
-    "    fig.legend(handles=legend_handles, loc=\"upper center\", ncol=min(len(legend_handles), 5), fontsize=8, bbox_to_anchor=(0.5, 1.02))\n",
-    "\n",
-    "    n_holds = len(telemetry[\"holds\"])\n",
-    "    fig.suptitle(f\"Live — {len(t_arr)} steps, {n_chunks} inferences, {n_holds} holds, t={t_arr[-1]:.1f}s\", fontsize=11, y=1.04)\n",
-    "    plt.tight_layout()\n",
-    "    return fig\n",
-    "\n",
-    "# Show initial empty plot placeholder\n",
-    "plot_handle = display(plt.figure(figsize=(16, 2)), display_id=True)\n",
-    "plt.close()\n",
-    "\n",
-    "# --- Init ---\n",
-    "queue = QueueMixer(lerp_duration=LERP_DURATION)\n",
-    "queue.add(warmup_chunk)  # Seed so robot can move immediately\n",
-    "\n",
-    "infer_thread = GpuInferenceThread(policy, DEVICE)\n",
-    "infer_thread.start()\n",
-    "\n",
-    "goal_time = 1.0 / ASYNC_FPS\n",
-    "last_action = warmup_chunk[0]\n",
-    "step = 0\n",
-    "hold_count = 0\n",
-    "last_plot_time = 0.0\n",
-    "\n",
-    "print(f\"Async rollout at {ASYNC_FPS} Hz — interrupt kernel to stop\")\n",
-    "\n",
-    "try:\n",
-    "    while True:\n",
-    "        loop_start = time.perf_counter()\n",
-    "        t_rel = loop_start - t_start\n",
-    "\n",
-    "        # 1. Check for inference result\n",
-    "        result = infer_thread.get_result()\n",
-    "        if result is not None:\n",
-    "            chunk, latency = result\n",
-    "            offset = int(latency * ASYNC_FPS)\n",
-    "            queue.add(chunk, offset=offset)\n",
-    "            queue.lerp_duration = max(offset, 1)\n",
-    "            telemetry[\"infer_completions\"].append((t_rel, latency * 1000, offset))\n",
-    "            telemetry[\"chunks_received\"].append((t_rel, chunk.copy(), infer_thread.inference_count))\n",
-    "            print(f\"[t={t_rel:.1f}s] <<< inference #{infer_thread.inference_count}: \"\n",
-    "                  f\"{latency*1000:.0f}ms, offset={offset}, queue={queue.remaining}\")\n",
-    "\n",
-    "        # 2. Request new inference when queue is low (with stuck/dead detection)\n",
-    "        INFER_TIMEOUT = 15.0\n",
-    "        if queue.remaining <= threshold:\n",
-    "            if not infer_thread.alive:\n",
-    "                print(f\"[t={t_rel:.1f}s] !!! Inference thread DEAD — restarting\")\n",
-    "                infer_thread = GpuInferenceThread(policy, DEVICE)\n",
-    "                infer_thread.start()\n",
-    "            elif infer_thread.busy_duration > INFER_TIMEOUT:\n",
-    "                print(f\"[t={t_rel:.1f}s] !!! Inference stuck for {infer_thread.busy_duration:.0f}s — force resetting\")\n",
-    "                infer_thread.force_reset()\n",
-    "\n",
-    "            if not infer_thread.busy:\n",
-    "                obs = get_full_observation()\n",
-    "                submitted = infer_thread.request(obs)\n",
-    "                if submitted:\n",
-    "                    telemetry[\"infer_requests\"].append(t_rel)\n",
-    "                    print(f\"[t={t_rel:.1f}s] >>> inference requested (queue={queue.remaining}/{chunk_size})\")\n",
-    "\n",
-    "        # 3. Pop action\n",
-    "        action = queue.pop()\n",
-    "        if action is not None:\n",
-    "            last_action = action\n",
-    "            hold_count = 0\n",
-    "        else:\n",
-    "            action = last_action\n",
-    "            hold_count += 1\n",
-    "            telemetry[\"holds\"].append(t_rel)\n",
-    "            if hold_count == 1 or hold_count % 30 == 0:\n",
-    "                print(f\"[t={t_rel:.1f}s] !!! Queue empty, holding (hold={hold_count})\")\n",
-    "\n",
-    "        # 4. Send to robot\n",
-    "        if action is not None:\n",
-    "            action_dict = {f\"{jn}.pos\": float(action[i]) for i, jn in enumerate(JOINT_NAMES)}\n",
-    "            robot.send_action(action_dict)\n",
-    "\n",
-    "        # 5. Record telemetry\n",
-    "        telemetry[\"t\"].append(t_rel)\n",
-    "        telemetry[\"actions_sent\"].append(action.copy() if action is not None else last_action.copy())\n",
-    "        telemetry[\"queue_depth\"].append((t_rel, queue.remaining))\n",
-    "\n",
-    "        # 6. Live plot update (throttled)\n",
-    "        if t_rel - last_plot_time > PLOT_INTERVAL and len(telemetry[\"t\"]) > 10:\n",
-    "            fig = render_telemetry(telemetry, threshold, chunk_size)\n",
-    "            if fig is not None:\n",
-    "                plot_handle.update(fig)\n",
-    "                plt.close(fig)\n",
-    "            last_plot_time = t_rel\n",
-    "\n",
-    "        # 7. Maintain FPS\n",
-    "        elapsed = time.perf_counter() - loop_start\n",
-    "        sleep_time = goal_time - elapsed\n",
-    "        if sleep_time > 0:\n",
-    "            time.sleep(sleep_time)\n",
-    "\n",
-    "        step += 1\n",
-    "\n",
-    "except KeyboardInterrupt:\n",
-    "    print(f\"\\nStopped after {step} steps, {infer_thread.inference_count} inferences, \"\n",
-    "          f\"{len(telemetry['holds'])} holds, duration={time.perf_counter() - t_start:.1f}s\")\n",
-    "\n",
-    "infer_thread.stop()\n",
-    "print(\"Inference thread stopped\")\n",
-    "\n",
-    "# Final plot\n",
-    "fig = render_telemetry(telemetry, threshold, chunk_size)\n",
-    "if fig is not None:\n",
-    "    plot_handle.update(fig)\n",
-    "    plt.close(fig)\n",
-    "\n",
-    "if telemetry[\"infer_completions\"]:\n",
-    "    latencies_ms = [lat for _, lat, _ in telemetry[\"infer_completions\"]]\n",
-    "    t_arr = np.array(telemetry[\"t\"])\n",
-    "    print(f\"\\nInference latencies: mean={np.mean(latencies_ms):.0f}ms, \"\n",
-    "          f\"min={np.min(latencies_ms):.0f}ms, max={np.max(latencies_ms):.0f}ms\")\n",
-    "    print(f\"Queue empty fraction: {len(telemetry['holds'])/max(len(t_arr),1)*100:.1f}%\")\n",
-    "    print(f\"Total duration: {t_arr[-1]:.1f}s, steps: {len(t_arr)}, effective Hz: {len(t_arr)/t_arr[-1]:.1f}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b0e2a88d",
-   "metadata": {},
-   "source": [
-    "## Disconnect"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d6eb8f97",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for name, cam in cameras.items():\n",
-    "    cam.disconnect()\n",
-    "\n",
-    "    print(f\"Camera '{name}' disconnected\")\n",
-    "    \n",
-    "robot.disconnect()\n",
-    "print(\"Robot disconnected\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "physical-ai-studio (3.12.3)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/physicalai/robot/__init__.py b/src/physicalai/robot/__init__.py
index 1465750..110f90a 100644
--- a/src/physicalai/robot/__init__.py
+++ b/src/physicalai/robot/__init__.py
@@ -14,13 +14,14 @@
 from __future__ import annotations
 
 from physicalai.robot.connect import connect
-from physicalai.robot.interface import Robot
+from physicalai.robot.interface import Robot, RobotObservation
 from physicalai.robot.verify import verify_robot
 
 __all__ = [
     "Robot",
     "connect",
     "verify_robot",
+    "RobotObservation",
 ]
 
 
diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index bb92225..b0b7e2b 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -110,6 +110,7 @@ def __init__(  # noqa: D107
         self._action_queue = action_queue or ActionQueue(smoother=LerpSmoother(duration_frames=_DEFAULT_LERP_FRAMES))
         self._obs_to_input = obs_to_input or default_observation_to_input
         self._callbacks = list(callbacks)
+        self._goal_time = (1.0 / fps) * 3
 
     def run(self, *, duration_s: float | None = None) -> RunStats:
         """Run the control loop.
@@ -166,7 +167,7 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
                 if modified is not None:
                     action = modified
 
-                self._robot.send_action(action)
+                self._robot.send_action(action, goal_time=self._goal_time)
                 self._invoke_callback("on_action_sent", action=action, step=step)
 
                 elapsed = time.perf_counter() - loop_start

From 07790a77f0be950325be8139bc417ba0b753aca2 Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Sun, 17 May 2026 22:00:52 +0200
Subject: [PATCH 04/19] remove plan

---
 .../runtime-system/implementation_plan.md     | 1146 -----------------
 1 file changed, 1146 deletions(-)
 delete mode 100644 docs/design/components/runtime-system/implementation_plan.md

diff --git a/docs/design/components/runtime-system/implementation_plan.md b/docs/design/components/runtime-system/implementation_plan.md
deleted file mode 100644
index 07b1473..0000000
--- a/docs/design/components/runtime-system/implementation_plan.md
+++ /dev/null
@@ -1,1146 +0,0 @@
-# Runtime System — Implementation Plan
-
-This document is the implementation plan for `physicalai.runtime`. It refines the original [policy_runtime_design.md](./policy_runtime_design.md) based on codebase exploration, bug analysis, and architecture review.
-
-Read [policy_runtime_design.md](./policy_runtime_design.md) first for API shape and ownership rules. This document covers what to build, in what order, and why.
-
-## Reference Implementation
-
-The golden reference is `physicalai/examples/runtime/inference_async.py` — a working async prototype with QueueMixer, InferenceThread, velocity clamping, and camera discovery. Every runtime component must match or exceed its behavior.
-
----
-
-## Phase 1: Critical Bug Fixes (half day) ✅
-
-Fix bugs on the code path Phase 2 depends on. Phase 2 defines a public runtime contract — workarounds would calcify into permanent API shape. `predict_action_chunk()` currently raises `RuntimeError` without these fixes because Bug 2's inverted guard blocks the runtime's call to `model.predict_action_chunk(obs)`. This is not stylistic — it is a hard blocker.
-
-### Bug 1: `use_action_queue` checks manifest, ignores runtime runner
-
-**File**: `physicalai/src/physicalai/inference/model.py` — `use_action_queue` property
-
-**Problem**: Reads `self.manifest.model.runner` class_path. Ignores `self.runner` passed at construction or set at runtime.
-
-**Fix**: Check `isinstance(self.runner, ActionChunking)` instead:
-
-```python
-@property
-def use_action_queue(self) -> bool:
-    from physicalai.inference.runners.action_chunking import ActionChunking
-    return isinstance(self.runner, ActionChunking)
-```
-
-### Bug 2: `select_action()` / `predict_action_chunk()` guards inverted
-
-**File**: `physicalai/src/physicalai/inference/model.py`
-
-**Problem**: `select_action()` raises when `not use_action_queue`. `predict_action_chunk()` raises when `use_action_queue`. Both are backwards — `select_action()` should be the generic one-action API, `predict_action_chunk()` should return raw chunks.
-
-**Fix**: Remove guards entirely. Both methods should work for any runner (shape-stable contract per design doc §3):
-
-| Runner          | `select_action()`  | `predict_action_chunk()` |
-| --------------- | ------------------ | ------------------------ |
-| single-pass     | runner output      | wrap as `(1, D)` chunk   |
-| chunk-producing | pop one via cursor | runner output            |
-
-### Bug 3: ACT export manifest declares SinglePass instead of ActionChunking
-
-**File**: `library/src/physicalai/export/mixin_policy.py` — `_build_manifest()`
-
-**Problem**: Checks `metadata.get("use_action_queue", False)` but ACT never sets this metadata flag. Manifest always gets `SinglePass` runner.
-
-**Fix**: ACT export must pass `use_action_queue=True` and `chunk_size=<config.chunk_size>` in its metadata kwargs. Same fix needed for Pi0.5 (Bug 5).
-
-### Deferred Bugs (document as issues, do not block Phase 2)
-
-| Bug   | Summary                                                               | Why deferred                                            |
-| ----- | --------------------------------------------------------------------- | ------------------------------------------------------- |
-| Bug 4 | Manifest missing `hardware` section (`RobotSpec`, `CameraSpec`)       | Not on inference code path                              |
-| Bug 5 | Pi0.5 export also declares SinglePass                                 | Same root cause as Bug 3, fix together                  |
-| Bug 7 | Pi0.5 normalization not baked into graph (external pre/postprocessor) | By design — manifest `preprocessors_specs` handles this |
-| Bug 8 | Pi0.5 denoising loop not exportable cleanly (11x graph size)          | Export concern, not runtime concern                     |
-| Bug 9 | `OVTokenizer` may need `import openvino_tokenizers` for custom ops    | Needs verification — may already work via adapter       |
-
----
-
-## Phase 2: Runtime System (2–3 days) ✅
-
-New package: `physicalai/src/physicalai/runtime/`
-
-```text
-physicalai/src/physicalai/runtime/
-├── __init__.py              # exports: PolicyRuntime, SyncExecution, AsyncExecution,
-│                            #          ActionQueue, LerpSmoother, ReplaceSmoother, RunStats
-├── smoothers.py             # ChunkSmoother ABC, ReplaceSmoother, LerpSmoother
-├── _action_queue.py         # ActionQueue (public via __init__, internal module)
-├── execution.py             # Execution ABC, SyncExecution, AsyncExecution, WorkerDiedError
-└── runtime.py               # PolicyRuntime, RunStats, default_observation_to_input
-```
-
-Dependency order: `smoothers.py` → `_action_queue.py` → `execution.py` → `runtime.py` → `__init__.py`
-
-### Architectural Decisions
-
-**ActionQueue is owned by PolicyRuntime, not hidden inside Execution.**
-
-The original design doc keeps Execution (scheduling) and ActionQueue (buffering) as separate concerns. This is correct — when `AsyncExecution(transport="process")` or `RemoteExecution` arrive, they should push chunks into the same ActionQueue without duplicating buffer logic.
-
-Users get a clean default API:
-
-```python
-runtime = PolicyRuntime(
-    robot=robot,
-    model=model,
-    execution=AsyncExecution(threshold=0.5),
-    fps=30,
-)
-runtime.run(duration_s=60)
-```
-
-Power users can override buffering:
-
-```python
-runtime = PolicyRuntime(
-    robot=robot,
-    model=model,
-    execution=AsyncExecution(threshold=0.5),
-    action_queue=ActionQueue(smoother=LerpSmoother(duration_frames=10)),
-    fps=30,
-)
-```
-
-**Execution is a scheduler, not a buffer.** It decides when/where inference runs and pushes results into ActionQueue. It does not own pop, remaining, or chunk_size.
-
-**InferenceModel must NOT import ActionQueue.** Per design doc §4: if both layers need pop-from-chunk mechanics, they share `ActionChunkCursor`, not `ActionQueue`.
-
-### 2.1 `smoothers.py`
-
-Extracted from `QueueMixer.add()` in inference_async.py.
-
-```python
-class ChunkSmoother(ABC):
-    """Merges a new action chunk into remaining actions from the previous chunk."""
-
-    @abstractmethod
-    def merge(
-        self,
-        remaining: np.ndarray,    # (R, action_dim) — unconsumed actions from previous chunk
-        incoming: np.ndarray,     # (H, action_dim) — new chunk from inference
-        offset: int,              # skip first N actions of incoming (latency compensation)
-    ) -> np.ndarray:
-        """Return merged actions array. Called by ActionQueue.push_chunk()."""
-        ...
-
-
-class ReplaceSmoother(ChunkSmoother):
-    """Drop remaining actions, use incoming[offset:]."""
-
-    def merge(self, remaining, incoming, offset):
-        return incoming[offset:]
-
-
-class LerpSmoother(ChunkSmoother):
-    """Lerp-blend overlapping region, then append non-overlapping tail.
-
-    Stateless merge — no hidden mutation. duration_frames is the fallback
-    blending window used when offset is 0. When offset > 0, the blending
-    window is computed from offset directly: lerp_dur = max(offset, 1).
-
-    Matches QueueMixer.add() from inference_async.py:
-    - Weights: w_i = max(1.0 - i / lerp_dur, 0.0) for old actions
-    - Overlap region: blended = w * remaining + (1 - w) * incoming
-    """
-
-    def __init__(self, duration_frames: int = 5) -> None:
-        self.duration_frames = duration_frames
-
-    def merge(self, remaining, incoming, offset):
-        lerp_dur = max(offset, 1) if offset > 0 else self.duration_frames
-
-        incoming = incoming[offset:]
-        n_remain = len(remaining)
-        lerp_dur = min(n_remain, lerp_dur)
-
-        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)
-        weights = weights[:, np.newaxis]
-
-        n_blend = min(n_remain, len(incoming))
-        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]
-
-        return np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)
-```
-
-Key: `offset` is not just "skip N actions." It is latency compensation — `offset = int(inference_latency * fps)`. The smoother must handle this, not the caller.
-
-### 2.2 `_action_queue.py` (public API, internal module)
-
-Thread-safe action buffer with smoother integration and hold telemetry.
-
-```python
-class ActionQueue:
-    """Thread-safe action buffer with chunk smoothing and starvation telemetry.
-
-    Public API — exported from physicalai.runtime. Power users can override
-    the default ActionQueue on PolicyRuntime to customize smoothing behavior.
-    Execution pushes chunks into it; PolicyRuntime pops actions from it.
-    """
-
-    def __init__(self, smoother: ChunkSmoother | None = None) -> None:
-        self._smoother = smoother or ReplaceSmoother()
-        self._deque: deque[np.ndarray] = deque()
-        self._lock = threading.Lock()
-        self._consecutive_holds: int = 0
-        self._total_holds: int = 0
-        self._total_pops: int = 0
-
-    def push_chunk(self, chunk: np.ndarray, offset: int = 0) -> None:
-        """Merge a new chunk into the queue. Thread-safe."""
-        with self._lock:
-            remaining = (np.stack(list(self._deque))
-                         if self._deque
-                         else np.empty((0, chunk.shape[1]), dtype=chunk.dtype))
-            merged = self._smoother.merge(remaining, chunk, offset)
-            self._deque.clear()
-            self._deque.extend(merged)
-
-    def pop(self) -> np.ndarray | None:
-        """Pop next action, or None if empty. Thread-safe."""
-        with self._lock:
-            if not self._deque:
-                self._consecutive_holds += 1
-                self._total_holds += 1
-                return None
-            self._consecutive_holds = 0
-            self._total_pops += 1
-            return self._deque.popleft()
-
-    @property
-    def remaining(self) -> int:
-        with self._lock:
-            return len(self._deque)
-
-    def below_threshold(self, threshold: int) -> bool:
-        return self.remaining < threshold
-
-    def clear(self) -> None:
-        with self._lock:
-            self._queue = None
-            self._index = 0
-
-    # --- Telemetry ---
-    @property
-    def consecutive_holds(self) -> int:
-        return self._consecutive_holds
-
-    @property
-    def total_holds(self) -> int:
-        return self._total_holds
-
-    @property
-    def total_pops(self) -> int:
-        return self._total_pops
-```
-
-### 2.3 `execution.py`
-
-**Execution ABC** — scheduler only. Pushes chunks into ActionQueue, does not own pop/remaining.
-
-```python
-class Execution(ABC):
-    """Decides when and where inference runs. Pushes results into ActionQueue."""
-
-    @abstractmethod
-    def start(self, model: InferenceModel, action_queue: ActionQueue) -> None:
-        """Bind to model and queue. Called once before the loop."""
-        ...
-
-    @abstractmethod
-    def maybe_request(self, observation: dict[str, np.ndarray]) -> None:
-        """Check if new inference is needed. If so, run or schedule it."""
-        ...
-
-    @abstractmethod
-    def warmup(self, sample_observation: dict[str, np.ndarray]) -> None:
-        """Run one inference to discover chunk_size and seed the queue.
-
-        After warmup():
-        - action_queue has one chunk ready (robot starts moving immediately)
-        - self.chunk_size is set
-        - self.action_dim is set
-        """
-        ...
-
-    @abstractmethod
-    def stop(self) -> None:
-        """Stop scheduling. For async: signal thread, join with timeout."""
-        ...
-
-    @property
-    @abstractmethod
-    def chunk_size(self) -> int:
-        """Discovered after warmup(). Used to compute threshold."""
-        ...
-```
-
-**SyncExecution** — blocks on inference when queue runs low.
-
-```python
-class SyncExecution(Execution):
-    """Synchronous inference in the control thread."""
-
-    def __init__(self) -> None:
-        self._model: InferenceModel | None = None
-        self._queue: ActionQueue | None = None
-        self._chunk_size: int = 0
-
-    def start(self, model, action_queue):
-        self._model = model
-        self._queue = action_queue
-
-    def warmup(self, sample_observation):
-        actions = self._model.predict_action_chunk(sample_observation)
-        self._chunk_size = actions.shape[0]
-        self._queue.push_chunk(actions, offset=0)
-
-    def maybe_request(self, observation):
-        if self._queue.below_threshold(1):  # refill when empty
-            actions = self._model.predict_action_chunk(observation)
-            self._queue.push_chunk(actions, offset=0)
-
-    def stop(self):
-        pass
-
-    @property
-    def chunk_size(self):
-        return self._chunk_size
-```
-
-**AsyncExecution** — background thread with health monitoring. Maps to `InferenceThread` from inference_async.py.
-
-```python
-class WorkerDiedError(RuntimeError):
-    """Raised when the inference worker thread dies unexpectedly."""
-    pass
-
-
-class AsyncExecution(Execution):
-    """Async inference in a background thread.
-
-    Thread architecture (matches inference_async.py):
-
-        Control thread (main):              Inference thread (background):
-        ─────────────────────               ────────────────────────────
-        loop at fps:                        loop:
-          obs = robot.get_observation()       wait for obs_slot
-          execution.maybe_request(obs)        chunk = model.predict_action_chunk(obs)
-          action = queue.pop()                offset = int(latency * fps)
-          robot.send_action(action)           queue.push_chunk(chunk, offset)
-    """
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        fps: int = 30,
-        watchdog_timeout_s: float = 30.0,
-        max_consecutive_holds: int | None = None,   # default: 3 * fps (3 seconds)
-    ) -> None:
-        self._threshold_frac = threshold
-        self._fps = fps
-        self._watchdog_timeout_s = watchdog_timeout_s
-        self._max_consecutive_holds = max_consecutive_holds or 3 * fps
-
-        # Set during start()
-        self._model: InferenceModel | None = None
-        self._queue: ActionQueue | None = None
-        self._chunk_size: int = 0
-        self._threshold_count: int = 0
-
-        # Thread state
-        self._lock = threading.Lock()
-        self._obs_slot: dict | None = None
-        self._obs_ready = threading.Event()
-        self._running_inference = False
-        self._request_time: float = 0.0
-        self._stop_event = threading.Event()
-        self._thread: threading.Thread | None = None
-        self._death_cause: BaseException | None = None
-        self._inference_count: int = 0
-
-    def start(self, model, action_queue):
-        self._model = model
-        self._queue = action_queue
-        self._thread = threading.Thread(target=self._run, name="InferenceThread", daemon=True)
-        self._thread.start()
-
-    def warmup(self, sample_observation):
-        actions = self._model.predict_action_chunk(sample_observation)
-        self._chunk_size = actions.shape[0]
-        self._threshold_count = int(self._chunk_size * self._threshold_frac)
-        self._queue.push_chunk(actions, offset=0)
-
-    def maybe_request(self, observation):
-        # Check for worker death — raise, don't silently continue
-        if self._thread is not None and not self._thread.is_alive():
-            if self._death_cause is not None:
-                raise WorkerDiedError(
-                    f"Inference thread died: {self._death_cause}"
-                ) from self._death_cause
-
-        # Check for stuck inference
-        if self._busy_duration > self._watchdog_timeout_s:
-            logger.warning(
-                "Inference stuck for %.0fs — force resetting", self._busy_duration,
-            )
-            self._force_reset()
-
-        # Submit if queue is low and worker is idle
-        if self._queue.below_threshold(self._threshold_count):
-            if not self._busy:
-                # Defensive copy — observation may be reused by caller
-                snapshot = {
-                    k: v.copy() if isinstance(v, np.ndarray) else v
-                    for k, v in observation.items()
-                }
-                with self._lock:
-                    self._obs_slot = snapshot
-                    self._request_time = time.perf_counter()
-                self._obs_ready.set()
-
-    def stop(self):
-        if self._thread is not None:
-            self._stop_event.set()
-            self._obs_ready.set()   # unblock wait
-            self._thread.join(timeout=10.0)
-
-    @property
-    def chunk_size(self):
-        return self._chunk_size
-
-    # --- Health properties ---
-
-    @property
-    def alive(self) -> bool:
-        return self._thread is not None and self._thread.is_alive()
-
-    @property
-    def _busy(self) -> bool:
-        with self._lock:
-            return self._obs_slot is not None or self._running_inference
-
-    @property
-    def _busy_duration(self) -> float:
-        with self._lock:
-            if not (self._obs_slot is not None or self._running_inference):
-                return 0.0
-            return time.perf_counter() - self._request_time
-
-    @property
-    def inference_count(self) -> int:
-        return self._inference_count
-
-    # --- Internal ---
-
-    def _force_reset(self) -> None:
-        with self._lock:
-            self._obs_slot = None
-            self._running_inference = False
-        logger.warning("Force reset — cleared stuck inference state")
-
-    def _run(self) -> None:
-        """Inference thread main loop."""
-        try:
-            while not self._stop_event.is_set():
-                self._obs_ready.wait()
-                self._obs_ready.clear()
-
-                if self._stop_event.is_set():
-                    return
-
-                with self._lock:
-                    obs = self._obs_slot
-                    self._obs_slot = None
-                    if obs is None:
-                        continue
-                    self._running_inference = True
-
-                t0 = time.perf_counter()
-                actions = self._model.predict_action_chunk(obs)
-                latency = time.perf_counter() - t0
-
-                offset = int(latency * self._fps)
-
-                self._queue.push_chunk(actions, offset=offset)
-                self._inference_count += 1
-
-                with self._lock:
-                    self._running_inference = False
-
-        except Exception as e:
-            self._death_cause = e
-            logger.error("Inference thread died: %s", e, exc_info=True)
-```
-
-### 2.4 `runtime.py`
-
-```python
-from __future__ import annotations
-
-import logging
-import time
-from collections.abc import Callable, Mapping, Sequence
-from typing import Any, Protocol
-
-import numpy as np
-
-from physicalai.capture.camera import Camera, Frame
-from physicalai.inference import InferenceModel
-from physicalai.robot.interface import Robot, RobotObservation
-from physicalai.runtime._action_queue import ActionQueue
-from physicalai.runtime.execution import Execution, WorkerDiedError
-from physicalai.runtime.smoothers import LerpSmoother
-
-logger = logging.getLogger(__name__)
-
-def default_observation_to_input(
-    robot_obs: RobotObservation,
-    camera_frames: dict[str, Frame],
-) -> dict[str, Any]:
-    """Default observation-to-model-input conversion.
-
-    Maps:
-    - Joint positions → "state" array
-    - Camera frames → "images.{name}" arrays
-
-    For Pi0.5 or other models needing custom keys (e.g. "task"),
-    pass a custom obs_to_input callable to PolicyRuntime.
-    """
-    model_input: dict[str, Any] = {}
-
-    # Collect joint positions into "state" vector
-    if robot_obs.joint_positions:
-        model_input["state"] = np.array([robot_obs.joint_positions], dtype=np.float32)
-
-    # Map camera frames to "images.{name}"
-    for name, frame in camera_frames.items():
-        model_input[f"images.{name}"] = frame.data
-
-    return model_input
-
-
-class RuntimeCallback(Protocol):
-    """Optional hook points in the PolicyRuntime control loop."""
-
-    def before_send_action(self, *, action: np.ndarray, step: int) -> np.ndarray | None:
-        """Called before sending action. Return modified action or None to use original."""
-        ...
-
-    def on_action_sent(self, *, action: np.ndarray, step: int) -> None:
-        """Called after action is sent to robot."""
-        ...
-
-    def on_hold(self, *, step: int, holds: int) -> None:
-        """Called when action queue is empty and robot holds last position."""
-        ...
-
-
-class PolicyRuntime:
-    """Runs a policy on robot hardware.
-
-    Loop shape (matches inference_async.py):
-        obs = robot.get_observation()
-        model_input = obs_to_input(obs, cameras)
-        execution.maybe_request(model_input)
-        action = action_queue.pop()
-        if action is None: hold position
-        robot.send_action(action)
-        sleep_until_next_tick()
-    """
-
-    def __init__(
-        self,
-        robot: Robot,
-        model: InferenceModel,
-        execution: Execution,
-        fps: float,
-        cameras: Mapping[str, Camera] | None = None,
-        action_queue: ActionQueue | None = None,
-        obs_to_input: Callable[[RobotObservation, dict[str, Frame]], dict[str, Any]] | None = None,
-        callbacks: Sequence[RuntimeCallback] = (),
-    ) -> None:
-        self._robot = robot
-        self._model = model
-        self._execution = execution
-        self._fps = fps
-        self._cameras = cameras or {}
-        self._action_queue = action_queue or ActionQueue(smoother=LerpSmoother(duration_frames=5))
-        self._obs_to_input = obs_to_input or default_observation_to_input
-        self._callbacks = list(callbacks)
-
-    def run(self, *, duration_s: float | None = None) -> RunStats:
-        """Run the control loop.
-
-        1. Warm up — run one inference, seed queue, discover chunk_size
-        2. Loop — observe, maybe_request, pop, send, sleep
-        3. Shutdown — stop execution, drain
-        """
-        # --- Init ---
-        self._execution.start(self._model, self._action_queue)
-
-        sample_obs = self._build_model_input()
-        self._execution.warmup(sample_obs)
-
-        goal_time = 1.0 / self._fps
-        step = 0
-        last_action: np.ndarray | None = None
-
-        try:
-            while True:
-                if duration_s is not None and step * goal_time >= duration_s:
-                    break
-
-                loop_start = time.perf_counter()
-
-                # 1. Observe
-                obs = self._build_model_input()
-
-                # 2. Maybe request inference
-                self._execution.maybe_request(obs)
-
-                # 3. Pop action
-                action = self._action_queue.pop()
-                if action is not None:
-                    last_action = action
-                else:
-                    action = last_action
-                    holds = self._action_queue.consecutive_holds
-                    if holds == 1:
-                        logger.warning("Queue empty — holding position")
-                    elif holds % self._fps == 0:
-                        logger.warning(
-                            "Queue starvation: %d consecutive holds (%.1fs)",
-                            holds, holds / self._fps,
-                        )
-                    self._invoke_callback("on_hold", step=step, holds=holds)
-
-                if action is None:
-                    # No warmup result and no previous action — skip
-                    logger.error("No action available (warmup may have failed)")
-                    continue
-
-                # 4. Callbacks
-                action = self._invoke_callback("before_send_action", action=action, step=step) or action
-
-                # 5. Send
-                self._robot.send_action(action)
-                self._invoke_callback("on_action_sent", action=action, step=step)
-
-                # 6. Timing
-                elapsed = time.perf_counter() - loop_start
-                sleep_time = goal_time - elapsed
-                if sleep_time > 0:
-                    time.sleep(sleep_time)
-
-                step += 1
-
-        except KeyboardInterrupt:
-            logger.info("Interrupted by user")
-        except WorkerDiedError as e:
-            logger.error("Worker died during runtime: %s", e)
-            raise
-        finally:
-            self._shutdown(step)
-
-        return RunStats(
-            steps=step,
-            total_pops=self._action_queue.total_pops,
-            total_holds=self._action_queue.total_holds,
-            inference_count=getattr(self._execution, "inference_count", 0),
-        )
-
-    def _build_model_input(self) -> dict[str, Any]:
-        robot_obs = self._robot.get_observation()
-        camera_frames = {name: cam.read_latest() for name, cam in self._cameras.items()}
-        return self._obs_to_input(robot_obs, camera_frames)
-
-    def _shutdown(self, step: int) -> None:
-        """Robot and cameras must be connected before run(). Caller owns lifecycle."""
-        # 1. Stop inference scheduling
-        self._execution.stop()
-
-        # 2. Drain remaining actions (up to 1s) for smooth stop
-        remaining = self._action_queue.remaining
-        drain_limit = min(remaining, int(self._fps))
-        for _ in range(drain_limit):
-            action = self._action_queue.pop()
-            if action is not None:
-                self._robot.send_action(action)
-                time.sleep(1.0 / self._fps)
-
-        logger.info(
-            "Shutdown complete — %d steps, %d pops, %d holds",
-            step, self._action_queue.total_pops, self._action_queue.total_holds,
-        )
-
-    def _invoke_callback(self, method: str, **kwargs):
-        result = None
-        for cb in self._callbacks:
-            fn = getattr(cb, method, None)
-            if fn is not None:
-                result = fn(**kwargs)
-        return result
-```
-
-### 2.5 Tests
-
-```text
-physicalai/tests/unit/runtime/
-├── test_smoothers.py        # ReplaceSmoother, LerpSmoother: offset handling, lerp weights,
-│                            #   dynamic duration, edge cases (empty remaining, offset > chunk)
-├── test_action_queue.py     # push/pop, smoother integration, thread safety (concurrent
-│                            #   push+pop from 2 threads), hold counters, clear()
-├── test_execution.py        # SyncExecution: warmup seeds queue, maybe_request refills
-│                            # AsyncExecution: mock model, health monitoring (alive/busy/
-│                            #   busy_duration), WorkerDiedError propagation, force_reset
-└── test_runtime.py          # PolicyRuntime with mock robot + mock model:
-                             #   full loop, hold fallback, shutdown drain, callbacks,
-                             #   WorkerDiedError propagation, duration_s limit
-```
-
-All tests use mock `InferenceModel` and mock `Robot` — no hardware, no exported models.
-
----
-
-## Phase 3: CLI and Integration (1–2 days)
-
-1. `physicalai run --config so101_pi05.yaml` CLI command
-2. YAML config loader (`PolicyRuntime.from_config()`)
-3. Observation builder (bridges `Robot` protocol + `Camera` → model input dict)
-4. Migrate `inference_async.py` to use `PolicyRuntime` (becomes ~20 lines)
-
-### Velocity clamping and camera discovery stay outside core
-
-Velocity clamping (`max_speed`, `ramp_steps`, `commanded_pos` tracking) is SO-101-specific. Camera discovery (interactive selection, name mapping, blank cameras, flip) is app-specific.
-
-These belong in:
-
-- Example scripts (`examples/runtime/`)
-- CLI helpers (`physicalai.cli.run`)
-- User callbacks
-
-Not in `PolicyRuntime` or `Execution`. If a reusable pattern emerges across 2+ robots, promote to a formal action-transform layer later.
-
----
-
-## Phase 3.5: Runtime Telemetry (1–2 days)
-
-Streaming observability for the runtime control loop. The runtime process must not be bottlenecked by telemetry — all emission is fire-and-forget over zenoh pub-sub. A separate observer process handles visualization, aggregation, and persistence.
-
-### Constraint
-
-Two-week release deadline. Scope: emitter in runtime, observer CLI, zenoh-only transport. No OpenTelemetry, no Studio UI integration, no remote telemetry. Use OTel-compatible metric naming so a future OTLP bridge is trivial.
-
-### Architecture
-
-```text
-┌──────────────────────────────────────────┐
-│  Runtime Process (real-time budget)      │
-│                                          │
-│  PolicyRuntime.run()                     │
-│    └─ TelemetryEmitter                   │
-│         zenoh.put() per tick/inference   │
-│         no-op if zenoh not installed     │
-└──────────┬───────────────────────────────┘
-           │ zenoh pub-sub (local SHM)
-           ▼
-┌──────────────────────────────────────────┐
-│  Observer Process (separate, optional)   │
-│                                          │
-│  TelemetrySubscriber                     │
-│    ├─ Live console dashboard             │
-│    ├─ JSONL file sink (--record)         │
-│    └─ Future: WebSocket → Studio UI      │
-└──────────────────────────────────────────┘
-```
-
-### Topic Schema
-
-All topics prefixed with `physicalai/rt/{session_id}/`. Session ID is a short hex string generated at `run()` start.
-
-| Topic       | Frequency                            | Payload (msgpack)                                                                                                            | Purpose                                   |
-| ----------- | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- |
-| `tick`      | Every control loop tick (30 Hz)      | `step`, `timestamp`, `joint_positions: (D,)`, `action_sent: (D,)`, `queue_remaining: int`, `loop_duration_s`, `sleep_time_s` | Core loop health + executed trajectory    |
-| `inference` | Per inference completion (~2–3 Hz)   | `latency_s`, `offset`, `chunk: (H, D)`                                                                                       | Latency monitoring + predicted trajectory |
-| `lifecycle` | On start / warmup / shutdown / error | `event: str`, `metadata: dict`                                                                                               | Session boundaries                        |
-
-**Serialization**: msgpack with numpy arrays encoded as `{"__np__": true, "dtype": str, "shape": list, "data": bytes}`. Encode cost is ~50 μs per tick event, ~100 μs per inference event. Negligible in a 33 ms tick budget.
-
-**Naming convention**: Scalar fields use OTel-compatible names (`physicalai.runtime.loop_duration_s`, `physicalai.runtime.inference_latency_s`) so a future OTLP exporter in the observer can re-export without renaming.
-
-### New Files
-
-```text
-physicalai/src/physicalai/runtime/
-├── _telemetry.py                    # TelemetryEmitter (zenoh publisher, no-op fallback)
-└── observer/
-    ├── __init__.py
-    ├── __main__.py                  # python -m physicalai.runtime.observer
-    ├── _subscriber.py               # TelemetrySubscriber (zenoh sub + dispatch)
-    ├── _console.py                  # Live console dashboard
-    └── _recorder.py                 # JSONL file sink for offline replay
-```
-
-### Dependency
-
-```toml
-# pyproject.toml
-[project.optional-dependencies]
-telemetry = ["zenoh>=1.0", "msgpack>=1.0"]
-```
-
-zenoh is optional. `TelemetryEmitter` gracefully degrades to no-op when zenoh is not installed. Observer process requires `physicalai[telemetry]`.
-
-### ActionQueue Change
-
-Add `peek_remaining()` — returns a copy of queued actions as `(R, D)` array without consuming them. Called once per inference event (not per tick) to snapshot the post-smooth future trajectory.
-
-```python
-def peek_remaining(self) -> np.ndarray | None:
-    """Return copy of remaining actions without consuming them. Thread-safe."""
-    with self._lock:
-        if not self._deque:
-            return None
-        return np.stack(list(self._deque))
-```
-
-### Integration Points
-
-**PolicyRuntime** (4 touch points in `runtime.py`):
-
-1. Accept optional `telemetry: TelemetryEmitter | None` in `__init__`
-2. `emit_lifecycle("start")` at top of `run()`
-3. `emit_tick(...)` at end of each tick
-4. `emit_lifecycle("shutdown")` in `_shutdown()`
-
-**AsyncExecution** (1 touch point in `execution.py`):
-
-1. Accept optional `telemetry: TelemetryEmitter | None` in `__init__`
-2. `emit_inference(...)` after `push_chunk()` in the `_run()` thread
-
-No changes to `smoothers.py` or `__init__.py` exports. `TelemetryEmitter` is internal wiring, not user-facing API.
-
-### Out of Scope
-
-- Remote telemetry (zenoh is network-transparent; enabling it is config, not code)
-- Camera frame streaming (stays on iceoryx2)
-- OpenTelemetry / OTLP exporter (future observer plugin)
-- Studio UI WebSocket bridge
-- Model input snapshot recording (expensive, needs opt-in design)
-- Smoothing delta visualization (derivable in observer from `inference.chunk` vs `tick.action_sent`)
-
----
-
-## Phase 3.6: Fault Tolerance (1 day)
-
-Robot connections over USB/serial and camera feeds are fragile. USB hubs lose power, serial timeouts occur, cameras drop frames. The runtime must not crash on transient hardware errors — it must retry and recover, or degrade gracefully.
-
-### Current Problem
-
-`PolicyRuntime._build_model_input()` calls `robot.get_observation()` and `cam.read_latest()` with no error handling. `robot.send_action()` is also unprotected. Any `ConnectionError`, `OSError`, or `serial.SerialException` from the SO-101 serial bus kills the control loop.
-
-### Design Principles
-
-1. **Never crash the loop on a transient error.** Retry the operation. If retries are exhausted, hold position and log — do not raise.
-2. **Distinguish transient vs fatal.** USB disconnect that resolves after replug = transient. `ValueError` from wrong action shape = fatal (programmer error). `KeyboardInterrupt` = always propagate.
-3. **No silent degradation.** Every recovery must log a warning and emit a telemetry lifecycle event. The operator must know the robot hiccupped.
-4. **Bound retry duration.** Retries must not exceed the tick budget. If `get_observation()` fails 3 times within the tick, use the last known observation and move on.
-5. **Camera failure ≠ robot failure.** A dropped camera frame should use the last known frame (stale but safe). A robot communication failure is more serious but still retryable.
-
-### Implementation: `_resilient_observe()` and `_resilient_send()`
-
-Replace the raw calls in the control loop with retry-wrapped variants:
-
-```python
-_MAX_OBS_RETRIES = 3
-_MAX_SEND_RETRIES = 2
-_RETRY_BACKOFF_S = 0.001  # 1ms between retries — bounded, won't bust 33ms budget
-
-def _resilient_observe(self) -> dict[str, Any]:
-    """Build model input with retry on transient hardware errors."""
-    # Robot observation
-    robot_obs = None
-    for attempt in range(_MAX_OBS_RETRIES):
-        try:
-            robot_obs = self._robot.get_observation()
-            self._consecutive_error_ticks = 0
-            break
-        except (ConnectionError, OSError) as e:
-            logger.warning("Robot observation failed (attempt %d/%d): %s",
-                           attempt + 1, _MAX_OBS_RETRIES, e)
-            if self._telemetry:
-                self._telemetry.emit_lifecycle("obs_error", error=str(e),
-                                               attempt=attempt + 1)
-            if attempt < _MAX_OBS_RETRIES - 1:
-                time.sleep(_RETRY_BACKOFF_S)
-
-    if robot_obs is None:
-        self._consecutive_error_ticks += 1
-        if self._consecutive_error_ticks >= self._max_consecutive_error_ticks:
-            msg = (f"Robot unreachable for {self._consecutive_error_ticks} consecutive ticks "
-                   f"({self._consecutive_error_ticks / self._fps:.1f}s)")
-            if self._telemetry:
-                self._telemetry.emit_lifecycle("connection_lost", error=msg)
-            raise ConnectionError(msg)
-        if self._last_robot_obs is not None:
-            logger.warning("Using stale robot observation (error tick %d/%d)",
-                           self._consecutive_error_ticks,
-                           self._max_consecutive_error_ticks)
-            robot_obs = self._last_robot_obs
-            self._stale_obs_ticks += 1
-        else:
-            raise ConnectionError("Robot observation failed and no fallback available")
-    self._last_robot_obs = robot_obs
-
-    # Camera frames — independent per camera, each can fail independently
-    camera_frames: dict[str, Frame] = {}
-    for name, cam in self._cameras.items():
-        try:
-            camera_frames[name] = cam.read_latest()
-            self._last_camera_frames[name] = camera_frames[name]
-        except (ConnectionError, OSError) as e:
-            logger.warning("Camera '%s' read failed: %s — using last frame", name, e)
-            if name in self._last_camera_frames:
-                camera_frames[name] = self._last_camera_frames[name]
-                self._stale_obs_ticks += 1
-
-    return self._obs_to_input(robot_obs, camera_frames)
-
-
-def _resilient_send(self, action: np.ndarray) -> None:
-    """Send action with retry on transient errors."""
-    for attempt in range(_MAX_SEND_RETRIES):
-        try:
-            self._robot.send_action(action)
-            self._consecutive_error_ticks = 0
-            return
-        except (ConnectionError, OSError) as e:
-            logger.warning("send_action failed (attempt %d/%d): %s",
-                           attempt + 1, _MAX_SEND_RETRIES, e)
-            if self._telemetry:
-                self._telemetry.emit_lifecycle("send_error", error=str(e),
-                                               attempt=attempt + 1)
-            if attempt < _MAX_SEND_RETRIES - 1:
-                time.sleep(_RETRY_BACKOFF_S)
-    self._consecutive_error_ticks += 1
-    self._transient_errors += 1
-    logger.error("send_action failed after %d attempts — skipping tick",
-                 _MAX_SEND_RETRIES)
-```
-
-### Error Classification
-
-| Exception type               | Source                                                    | Treatment                         |
-| ---------------------------- | --------------------------------------------------------- | --------------------------------- |
-| `ConnectionError`            | SO-101 serial port closed, servo not responding           | Retry, then stale fallback        |
-| `OSError`                    | USB disconnect, file descriptor error, camera device lost | Retry, then stale fallback        |
-| `TimeoutError`               | Serial read timeout (subclass of `OSError`)               | Retry, then stale fallback        |
-| `ValueError`, `RuntimeError` | Wrong action shape, uncalibrated mode, programming error  | **Fatal** — propagate immediately |
-| `WorkerDiedError`            | Inference thread crash                                    | **Fatal** — propagate immediately |
-| `KeyboardInterrupt`          | User Ctrl+C                                               | **Always propagate**              |
-
-### State Tracking
-
-`PolicyRuntime` gains fields for stale-observation fallback and error escalation:
-
-```python
-self._last_robot_obs: RobotObservation | None = None
-self._last_camera_frames: dict[str, Frame] = {}
-self._consecutive_error_ticks: int = 0
-self._max_consecutive_error_ticks: int = int(3 * fps)  # ~3 seconds
-self._stale_obs_ticks: int = 0
-self._transient_errors: int = 0
-```
-
-`_consecutive_error_ticks` is reset to 0 on any successful `get_observation()` or `send_action()`. It increments when all retries within a tick fail. When it reaches `max_consecutive_error_ticks`, the runtime raises `ConnectionError`.
-
-`RunStats` gains fault tolerance metrics:
-
-```python
-@dataclass(frozen=True)
-class RunStats:
-    steps: int
-    total_pops: int
-    total_holds: int
-    inference_count: int
-    transient_errors: int    # total retried hardware errors
-    stale_obs_ticks: int     # ticks where stale observation was used
-```
-
-The telemetry `tick` event gains a `stale_obs: bool` field so the observer can flag degraded ticks.
-
-### Warmup Resilience
-
-Warmup is the most fragile moment — USB may not be fully enumerated, servos may still be initializing. The first call to `_build_model_input()` has no stale fallback.
-
-Warmup gets its own retry loop with longer timeout:
-
-```python
-_WARMUP_RETRIES = 5
-_WARMUP_BACKOFF_S = 1.0  # 1 second between warmup retries
-
-def _warmup_with_retry(self) -> None:
-    for attempt in range(_WARMUP_RETRIES):
-        try:
-            sample_obs = self._build_model_input()
-            self._execution.warmup(sample_obs)
-            return
-        except (ConnectionError, OSError) as e:
-            logger.warning("Warmup failed (attempt %d/%d): %s",
-                           attempt + 1, _WARMUP_RETRIES, e)
-            if attempt < _WARMUP_RETRIES - 1:
-                time.sleep(_WARMUP_BACKOFF_S)
-    msg = f"Warmup failed after {_WARMUP_RETRIES} attempts — robot or cameras unreachable"
-    raise ConnectionError(msg)
-```
-
-Total warmup retry budget: 5 seconds. Long enough for USB enumeration, short enough to not confuse the user.
-
-### Reconnection
-
-Retry within a tick handles brief glitches (serial timeout, dropped USB packet). For sustained disconnects (USB cable pulled), the robot's `is_connected()` returns `False` and consecutive retries will keep failing.
-
-Full reconnection (calling `robot.disconnect()` then `robot.connect()`) is **not** done automatically in the runtime. Reconnecting a serial bus mid-loop is dangerous — it resets servo state, torque settings, and calibration. This must be an explicit user decision.
-
-Instead, the runtime:
-
-1. Logs an error with reconnection instructions
-2. Continues holding position (stale obs + last action)
-3. Emits `lifecycle("connection_lost")` telemetry event
-4. After `max_consecutive_errors` (default: `3 * fps` = 3 seconds of failures), raises `ConnectionError` with a clear message
-
-This gives the user time to replug USB without the loop crashing, but doesn't silently run for minutes with a dead robot.
-
-### Camera Recovery
-
-Cameras are more resilient than robots — `SharedCamera` (iceoryx2) handles publisher death and re-spawn. Direct camera backends (`RealsenseCamera`, `UVCCamera`) may need explicit reconnection.
-
-The runtime treats camera failure as soft: use last frame, log warning, continue. If a camera has never produced a frame (failure on first read), omit it from the model input dict. The model's behavior with missing camera keys is model-specific — not the runtime's concern.
-
-### Tests
-
-```text
-physicalai/tests/unit/runtime/
-└── test_fault_tolerance.py   # Mock robot that raises on get_observation/send_action:
-                              #   - transient error → retry succeeds
-                              #   - sustained error → stale fallback
-                              #   - fatal error → propagates
-                              #   - camera failure → stale frame used
-                              #   - max_consecutive_errors → ConnectionError raised
-```
-
----
-
-## Phase 4: Advanced (later)
-
-1. `AsyncExecution(transport="process")` for PyTorch CPU (GIL contention)
-2. RTC guidance correction (requires split export from `library/`)
-3. `RemoteExecution` + gRPC `PolicyServer` (see [policy_server_design.md](./policy_server_design.md))
-
----
-
-## Component Mapping: inference_async.py → Library
-
-| Script component                            | Library target                                             | Notes                                                   |
-| ------------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------- |
-| `QueueMixer`                                | `ActionQueue` + `LerpSmoother`                             | Offset-aware blending extracted into smoother           |
-| `QueueMixer.lerp_duration = max(offset, 1)` | `LerpSmoother.merge()` computes from offset                | Stateless — `duration_frames` is fallback when offset=0 |
-| `InferenceThread`                           | `AsyncExecution`                                           | Same thread architecture: obs_slot + result push        |
-| `InferenceThread.force_reset()`             | `AsyncExecution._force_reset()`                            | Clears stuck state                                      |
-| `InferenceThread.busy_duration`             | `AsyncExecution._busy_duration`                            | Watchdog timeout trigger                                |
-| `InferenceThread.alive`                     | `AsyncExecution.alive`                                     | Dead thread detection                                   |
-| `get_full_observation()`                    | `default_observation_to_input()` + `obs_to_input` callable | Separates robot obs format from model input format      |
-| `action_to_robot_dict()`                    | `Robot.send_action(ndarray)`                               | Robot protocol handles conversion                       |
-| `main()` while-loop                         | `PolicyRuntime.run()`                                      | Same 5-step structure                                   |
-| Velocity clamping + ramp                    | User callback or example code                              | Too robot-specific for core runtime                     |
-| Camera discovery + `SharedCamera`           | User code / CLI (Phase 3)                                  | App-specific                                            |
-| Warm-up inference + queue seeding           | `Execution.warmup()`                                       | Seeds queue so loop starts with actions                 |
-| `inference_thread.alive` check + restart    | `AsyncExecution.maybe_request()` raises `WorkerDiedError`  | Raise instead of silent restart                         |
-| `force_reset()` for stuck thread            | `AsyncExecution._force_reset()` via watchdog               | Auto-triggered when `busy_duration > timeout`           |
-| Hold position + `hold_count`                | `ActionQueue.consecutive_holds` + `PolicyRuntime` logging  | Telemetry via queue counters                            |
-| Two-backend support (torch/OV)              | `InferenceModel` abstraction                               | Runtime only calls `predict_action_chunk()`             |
-| (none — ad-hoc print/log)                   | `TelemetryEmitter` + zenoh pub-sub                         | Fire-and-forget, no-op when zenoh absent (Phase 3.5)    |
-| (none — loop crashes on USB error)          | `_resilient_observe()` / `_resilient_send()`               | Retry + stale fallback for transient errors (Phase 3.6) |
-
----
-
-## Design Gaps Addressed
-
-Gaps identified during architecture review, with resolutions.
-
-### Gap 1: Observation ownership in async
-
-**Problem**: Main thread passes observation dict to `maybe_request()`. If main thread reuses camera buffers, inference thread reads corrupted data.
-
-**Resolution**: `AsyncExecution.maybe_request()` performs defensive copy before submitting:
-
-```python
-snapshot = {
-    k: v.copy() if isinstance(v, np.ndarray) else v
-    for k, v in observation.items()
-}
-```
-
-Cost: one dict of numpy copies per inference request (not per tick — only when threshold triggers). At 30fps with threshold=0.5 and chunk_size=50, that's roughly once every ~0.8s.
-
-### Gap 2: Empty-queue telemetry
-
-**Problem**: "Hold last action" silently masks queue starvation.
-
-**Resolution**: `ActionQueue` tracks `consecutive_holds` and `total_holds`. `PolicyRuntime` logs warnings on first hold and every `fps` consecutive holds (1 per second). `on_hold` callback exposes starvation events to user code.
-
-### Gap 3: Graceful shutdown
-
-**Problem**: Hard stop can jerk the robot.
-
-**Resolution**: `PolicyRuntime._shutdown()` drains up to 1 second of remaining actions at loop FPS. Beyond 1s, hard stop — the user pressed Ctrl+C for a reason. Robot and cameras stay connected here; caller owns connect/disconnect lifecycle.
-
-### Gap 4: Error propagation
-
-**Problem**: Inference thread exceptions silently swallowed — `pop_action()` returns None forever.
-
-**Resolution**: `AsyncExecution` stores `_death_cause` on thread exception. `maybe_request()` checks `alive` and raises `WorkerDiedError` with original traceback preserved via `raise ... from`. PolicyRuntime catches and re-raises.
-
-### Gap 5: Two-backend support
-
-**Problem**: inference_async.py handles PyTorch and OpenVINO with different preprocessing.
-
-**Resolution**: Not a runtime concern. `InferenceModel` abstracts backend differences. Runtime only calls `model.predict_action_chunk(obs)`. The torch bypass in inference_async.py exists because the script bypasses `InferenceModel` for direct Pi05 policy access — the library runtime won't need this.
-
----
-
-## Resolved Questions
-
-1. **`PolicyRuntime.run()` returns `RunStats`.** `@dataclass` with 4 core fields (steps, total_pops, total_holds, inference_count) plus 2 fault tolerance fields (transient_errors, stale_obs_ticks) added in Phase 3.6. Useful for testing and logging.
-
-2. **Warm-up happens inside `run()`.** User doesn't forget, no "did I call warmup?" failure mode.
-
-3. **Dead worker raises `WorkerDiedError`.** Let caller decide recovery strategy. Auto-restart can mask systematic failures.
-
----
-
-## Relationship to Existing Design Docs
-
-This plan refines [policy_runtime_design.md](./policy_runtime_design.md). Key differences:
-
-| Topic                           | Original design doc                                              | This plan                                                                                                  |
-| ------------------------------- | ---------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
-| ActionQueue visibility          | Public parameter on PolicyRuntime                                | Public parameter with sensible default. Internal `_action_queue.py` module, exported via `__init__`.       |
-| Execution ABC                   | `start(action_queue, model)`, `maybe_request(obs, action_queue)` | `start(model, action_queue)`, `maybe_request(obs)` — queue stored internally on start, not passed per call |
-| Warmup                          | `warmup(sample_observation, n=2)`                                | `warmup(sample_observation)` — one call, seeds queue                                                       |
-| Health monitoring               | Not specified                                                    | First-class: alive, busy, busy_duration, WorkerDiedError, watchdog                                         |
-| Smoothing                       | `LerpChunkSmoother`, `ReplaceMerger`                             | `LerpSmoother`, `ReplaceSmoother` — stateless merge, offset-aware                                          |
-| Observation bridge              | Not specified                                                    | `obs_to_input` callable with `default_observation_to_input` fallback                                       |
-| `predict_action_chunk()` return | `Mapping[str, Any]` with `"actions"` key                         | `np.ndarray` directly (matches actual implementation)                                                      |
-| Bug fixes                       | Not applicable                                                   | Phase 1 prerequisite — bugs 1-3 on critical path                                                           |
-| Telemetry                       | Not specified                                                    | Phase 3.5: zenoh pub-sub emitter (fire-and-forget), separate observer process, optional dep                |
-| Fault tolerance                 | Not specified                                                    | Phase 3.6: retry + stale fallback for transient HW errors, error classification, bounded retry             |
-
-All ownership rules, boundary constraints, and deferred-until-needed decisions from the original design doc remain in effect.

From c79b0e9537954ffd675f7561a2c06dd59f7cb6e2 Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Sun, 17 May 2026 22:14:24 +0200
Subject: [PATCH 05/19] fix style

---
 src/physicalai/robot/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/physicalai/robot/__init__.py b/src/physicalai/robot/__init__.py
index 110f90a..d2159e9 100644
--- a/src/physicalai/robot/__init__.py
+++ b/src/physicalai/robot/__init__.py
@@ -19,9 +19,9 @@
 
 __all__ = [
     "Robot",
+    "RobotObservation",
     "connect",
     "verify_robot",
-    "RobotObservation",
 ]
 
 

From e6a986365f5bd80200abe540e97db1963fd8773f Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Sun, 17 May 2026 22:22:39 +0200
Subject: [PATCH 06/19] remove outdated example

---
 examples/runtime/inference_async.py | 764 ----------------------------
 1 file changed, 764 deletions(-)
 delete mode 100644 examples/runtime/inference_async.py

diff --git a/examples/runtime/inference_async.py b/examples/runtime/inference_async.py
deleted file mode 100644
index 9eade0a..0000000
--- a/examples/runtime/inference_async.py
+++ /dev/null
@@ -1,764 +0,0 @@
-# pyrefly: ignore-errors
-# Copyright (C) 2026 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-"""Async threaded inference for SO-101 with lerp-blended action chunking.
-
-Supports two backends:
-- **PyTorch/CUDA**: Pi05 checkpoint (.ckpt file) — GPU inference via select_action.
-- **OpenVINO**: Exported model directory — CPU/GPU inference via InferenceModel.
-
-The backend is auto-detected from the checkpoint path:
-- File ending in .ckpt → PyTorch Pi05
-- Directory → OpenVINO InferenceModel
-
-The main thread runs the robot control loop at a fixed FPS.
-A background thread runs model inference and pushes action chunks
-into a shared queue with lerp blending for smooth transitions.
-
-Uses LeRobot SO101Follower (degrees) for robot control and
-physicalai SharedCamera (iceoryx2) for camera input.
-
-Usage (PyTorch):
-    python inference_async_pi05.py \
-        --model-path pi05/pi05_eugene.ckpt \
-        --device cuda \
-        --robot-port /dev/ttyACM1 \
-        --task "pick a can and place it in a bowl"
-
-Usage (OpenVINO):
-    python inference_async_pi05.py \
-        --model-path exports/pi05_ov \
-        --device CPU \
-        --robot-port /dev/ttyACM1 \
-        --camera-map overhead=top arm=wrist \
-        --task "pick a can and place it in a bowl"
-"""
-
-from __future__ import annotations
-
-import argparse
-import logging
-import threading
-import time
-from pathlib import Path
-
-import cv2
-import numpy as np
-
-from lerobot.robots.so_follower import SO101Follower, SO101FollowerConfig
-from physicalai.capture import SharedCamera
-from physicalai.capture.discovery import DeviceInfo, discover_all
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s %(levelname)s [%(threadName)s] %(name)s: %(message)s",
-)
-logger = logging.getLogger(__name__)
-
-JOINT_NAMES = [
-    "shoulder_pan",
-    "shoulder_lift",
-    "elbow_flex",
-    "wrist_flex",
-    "wrist_roll",
-    "gripper",
-]
-
-IMAGE_WIDTH = 640
-IMAGE_HEIGHT = 480
-CAMERA_FPS = 30
-
-
-# ---------------------------------------------------------------------------
-# Queue Mixer — lerp-blends old and new action chunks
-# ---------------------------------------------------------------------------
-
-class QueueMixer:
-    """Action queue with linear-interpolation blending between chunks.
-
-    When a new chunk arrives while old actions remain, the overlapping
-    region is lerp-blended so the robot doesn't jerk.  An *offset*
-    parameter lets us skip the first N actions of the new chunk to
-    compensate for inference latency.
-    """
-
-    def __init__(self, lerp_duration: int = 5) -> None:
-        self.queue: np.ndarray | None = None
-        self.lerp_duration = lerp_duration
-        self.index = 0
-
-    def add(self, chunk: np.ndarray, offset: int = 0) -> None:
-        """Merge *chunk* (H, action_dim) into the queue."""
-        if self.queue is None or self.index >= len(self.queue):
-            self.queue = chunk[offset:]
-            self.index = 0
-            logger.info(
-                "[QueueMixer] First/exhausted → replaced queue (len=%d, offset=%d)",
-                len(self.queue), offset,
-            )
-            return
-
-        remaining = self.queue[self.index:]
-        incoming = chunk[offset:]
-        n_remain = len(remaining)
-        lerp_dur = min(n_remain, self.lerp_duration)
-
-        weights = np.maximum(1.0 - np.arange(n_remain) / max(lerp_dur, 1), 0.0)
-        weights = weights[:, np.newaxis]
-
-        n_blend = min(n_remain, len(incoming))
-        blended = weights[:n_blend] * remaining[:n_blend] + (1.0 - weights[:n_blend]) * incoming[:n_blend]
-
-        self.queue = np.concatenate([blended, incoming[n_blend:]], axis=0).astype(np.float32)
-        self.index = 0
-        logger.info(
-            "[QueueMixer] Blended chunk (blended=%d, appended=%d, total=%d, lerp=%d, offset=%d)",
-            n_blend, max(len(incoming) - n_blend, 0), len(self.queue), lerp_dur, offset,
-        )
-
-    def pop(self) -> np.ndarray | None:
-        """Pop the next action, or return None if empty."""
-        if self.queue is None or self.index >= len(self.queue):
-            return None
-        action = self.queue[self.index]
-        self.index += 1
-        return action
-
-    @property
-    def remaining(self) -> int:
-        if self.queue is None:
-            return 0
-        return max(len(self.queue) - self.index, 0)
-
-    @property
-    def empty(self) -> bool:
-        return self.remaining == 0
-
-
-# ---------------------------------------------------------------------------
-# Inference Thread (PyTorch / CUDA)
-# ---------------------------------------------------------------------------
-
-class InferenceThread:
-    """Background thread that runs model inference on demand.
-
-    Supports two backends:
-    - "torch": Pi05 GPU inference via select_action + action_queue draining.
-    - "openvino": InferenceModel inference via model(dict)["action"].
-
-    The control loop writes an observation dict into `obs_slot` and sets
-    `obs_ready`.  The inference thread picks it up, runs inference, and
-    pushes the result (chunk + timing) into `result_slot`.
-    """
-
-    def __init__(
-        self,
-        model: object,
-        backend: str,
-        device: str,
-        task: str,
-        flip_cameras: set[str],
-        camera_name_map: dict[str, str] | None = None,
-        blank_cameras: list[str] | None = None,
-    ) -> None:
-        self.model = model
-        self.backend = backend
-        self.device = device
-        self.task = task
-        self.flip_cameras = flip_cameras
-        self.camera_name_map = camera_name_map or {}
-        self.blank_cameras = blank_cameras or []
-
-        self._lock = threading.Lock()
-        self._obs_slot: dict | None = None
-        self._obs_ready = threading.Event()
-        self._running_inference = False
-        self._request_time = 0.0
-
-        self._result_lock = threading.Lock()
-        self._result_slot: tuple[np.ndarray, float] | None = None
-
-        self._stop = threading.Event()
-        self._thread = threading.Thread(target=self._run, name="InferenceThread", daemon=True)
-        self.inference_count = 0
-
-    def start(self) -> None:
-        logger.info("[InferenceThread] Starting background thread")
-        self._thread.start()
-
-    def stop(self) -> None:
-        logger.info("[InferenceThread] Requesting stop")
-        self._stop.set()
-        self._obs_ready.set()
-        self._thread.join(timeout=10.0)
-        logger.info("[InferenceThread] Stopped (ran %d inferences)", self.inference_count)
-
-    def request(self, observation: dict) -> bool:
-        """Submit an observation dict for inference. Returns False if busy."""
-        with self._lock:
-            if self._obs_slot is not None or self._running_inference:
-                return False
-            self._obs_slot = observation
-            self._request_time = time.perf_counter()
-        self._obs_ready.set()
-        logger.debug("[InferenceThread] Observation submitted")
-        return True
-
-    @property
-    def busy(self) -> bool:
-        with self._lock:
-            return self._obs_slot is not None or self._running_inference
-
-    @property
-    def alive(self) -> bool:
-        return self._thread.is_alive()
-
-    @property
-    def busy_duration(self) -> float:
-        """Seconds since last request was submitted. 0 if not busy."""
-        with self._lock:
-            if not (self._obs_slot is not None or self._running_inference):
-                return 0.0
-            return time.perf_counter() - self._request_time
-
-    def force_reset(self) -> None:
-        """Clear stuck state so new requests can be submitted."""
-        with self._lock:
-            self._obs_slot = None
-            self._running_inference = False
-        logger.warning("[InferenceThread] Force reset — cleared stuck state")
-
-    def get_result(self) -> tuple[np.ndarray, float] | None:
-        """Non-blocking fetch of the latest result. Returns (chunk, latency_s) or None."""
-        with self._result_lock:
-            r = self._result_slot
-            self._result_slot = None
-        return r
-
-    def _build_torch_observation(self, obs_dict: dict) -> object:
-        """Convert raw observation dict to Pi05 Observation on device (torch)."""
-        import torch
-        from physicalai.data.observation import Observation
-
-        state = torch.tensor(
-            [[obs_dict[f"{jn}.pos"] for jn in JOINT_NAMES]],
-            dtype=torch.float32,
-            device=self.device,
-        )
-
-        images = {}
-        for cam_key in [k for k in obs_dict if isinstance(obs_dict[k], np.ndarray) and obs_dict[k].ndim == 3]:
-            img = np.ascontiguousarray(obs_dict[cam_key])
-            if cam_key in self.flip_cameras:
-                img = cv2.rotate(img, cv2.ROTATE_180)
-            t = torch.from_numpy(img.copy()).float() / 255.0
-            t = t.permute(2, 0, 1).unsqueeze(0)
-            images[cam_key] = t.to(self.device)
-
-        return Observation(state=state, images=images, task=self.task)
-
-    def _build_ov_observation(self, obs_dict: dict) -> dict[str, np.ndarray]:
-        """Convert raw observation dict to InferenceModel input (numpy)."""
-        state = np.array([[obs_dict[f"{jn}.pos"] for jn in JOINT_NAMES]], dtype=np.float32)
-        observation: dict[str, np.ndarray] = {"state": state, "task": [self.task]}
-
-        for cam_key in [k for k in obs_dict if isinstance(obs_dict[k], np.ndarray) and obs_dict[k].ndim == 3]:
-            img = np.ascontiguousarray(obs_dict[cam_key])
-            if cam_key in self.flip_cameras:
-                img = cv2.rotate(img, cv2.ROTATE_180)
-            img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))
-            img = img.transpose(2, 0, 1).astype(np.float32)[np.newaxis] / 255.0
-            model_name = self.camera_name_map.get(cam_key, cam_key)
-            observation[f"images.{model_name}"] = img
-
-        for blank_cam in self.blank_cameras:
-            observation[f"images.{blank_cam}"] = np.zeros(
-                (1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), dtype=np.float32,
-            )
-
-        return observation
-
-    def _infer_torch(self, obs_dict: dict) -> np.ndarray:
-        """Run Pi05 inference and return the full action chunk as numpy."""
-        import torch
-        from collections import deque
-
-        observation = self._build_torch_observation(obs_dict)
-        self.model._action_queue = deque()
-
-        with torch.no_grad():
-            action = self.model.select_action(observation)
-        if self.device == "cuda":
-            torch.cuda.synchronize()
-
-        actions = [action.detach().cpu().numpy()]
-        while len(self.model._action_queue) > 0:
-            a = self.model._action_queue.popleft()
-            actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)
-
-        return np.array([np.squeeze(a) for a in actions], dtype=np.float32)
-
-    def _infer_ov(self, obs_dict: dict) -> np.ndarray:
-        """Run OpenVINO inference and return the full action chunk as numpy."""
-        model_input = self._build_ov_observation(obs_dict)
-        output = self.model(model_input)["action"]
-        chunk = np.squeeze(output)
-        if chunk.ndim == 1:
-            chunk = chunk[np.newaxis, :]
-        return chunk.astype(np.float32)
-
-    def _run(self) -> None:
-        logger.info("[InferenceThread] Thread started, waiting for observations")
-        while not self._stop.is_set():
-            self._obs_ready.wait()
-            self._obs_ready.clear()
-
-            if self._stop.is_set():
-                break
-
-            with self._lock:
-                obs_dict = self._obs_slot
-                self._obs_slot = None
-                self._running_inference = True
-
-            if obs_dict is None:
-                with self._lock:
-                    self._running_inference = False
-                continue
-
-            self.inference_count += 1
-            logger.info("[InferenceThread] >>> Inference #%d START", self.inference_count)
-            t0 = time.perf_counter()
-
-            try:
-                if self.backend == "torch":
-                    chunk = self._infer_torch(obs_dict)
-                else:
-                    chunk = self._infer_ov(obs_dict)
-            except Exception:
-                logger.exception("[InferenceThread] Inference failed")
-                with self._lock:
-                    self._running_inference = False
-                continue
-
-            latency = time.perf_counter() - t0
-            logger.info(
-                "[InferenceThread] <<< Inference #%d DONE  latency=%.1fms  chunk_shape=%s",
-                self.inference_count, latency * 1000, chunk.shape,
-            )
-
-            with self._result_lock:
-                self._result_slot = (chunk, latency)
-            with self._lock:
-                self._running_inference = False
-
-
-# ---------------------------------------------------------------------------
-# Observation helpers
-# ---------------------------------------------------------------------------
-
-def get_full_observation(
-    robot: SO101Follower,
-    cameras: dict[str, SharedCamera],
-) -> dict:
-    """Combine robot state + SharedCamera images into one dict.
-
-    LeRobot SO101Follower returns joint values in the correct
-    model domain (body joints in degrees, gripper in [0, 100]).
-    """
-    obs = robot.get_observation()
-    for cam_key, cam in cameras.items():
-        frame = cam.read_latest()
-        obs[cam_key] = frame.data
-    return obs
-
-
-def action_to_robot_dict(action: np.ndarray) -> dict[str, float]:
-    """Convert action array to a LeRobot send_action dict."""
-    return {f"{jn}.pos": float(action[i]) for i, jn in enumerate(JOINT_NAMES)}
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------
-# Camera discovery
-# ---------------------------------------------------------------------------
-
-def select_cameras() -> dict[str, str]:
-    """Discover cameras and interactively ask user to assign overhead/arm."""
-    print("\n[cameras] Discovering cameras ...", flush=True)
-
-    all_devices = discover_all()
-
-    # Flatten into a single list (deduplicate across drivers by hardware_id)
-    seen: set[str] = set()
-    devices: list[DeviceInfo] = []
-    for driver_devices in all_devices.values():
-        for dev in driver_devices:
-            key = dev.hardware_id or f"{dev.driver}:{dev.device_id}"
-            if key not in seen:
-                seen.add(key)
-                devices.append(dev)
-
-    if not devices:
-        raise RuntimeError("No cameras found. Check connections and permissions.")
-
-    print(f"\n  Found {len(devices)} camera(s):", flush=True)
-    for i, dev in enumerate(devices):
-        stable_id = dev.hardware_id or ""
-        print(f"    [{i}] {dev.name or dev.model or 'Unknown'}  ({stable_id or dev.device_id})", flush=True)
-
-    cameras: dict[str, str] = {}
-    for role in ("overhead", "arm"):
-        while True:
-            choice = input(f"\n  Select {role} camera (0-{len(devices)-1}), or 's' to skip: ").strip()
-            if choice.lower() == "s":
-                print(f"    Skipping {role} camera", flush=True)
-                break
-            if choice.isdigit() and 0 <= int(choice) < len(devices):
-                dev = devices[int(choice)]
-                # Prefer stable by-id path, fall back to device_id
-                path = dev.hardware_id if dev.hardware_id else dev.device_id
-                cameras[role] = path
-                print(f"    {role} → {path}", flush=True)
-                break
-            print(f"    Invalid choice. Enter 0-{len(devices)-1} or 's'.")
-
-    if not cameras:
-        raise RuntimeError("At least one camera must be selected.")
-
-    return cameras
-
-
-def parse_args() -> argparse.Namespace:
-    p = argparse.ArgumentParser(description="Async threaded SO-101 inference (PyTorch or OpenVINO).")
-    p.add_argument("--model-path", type=str, required=True,
-                    help="Path to Pi05 checkpoint (.ckpt) or OpenVINO export directory")
-    p.add_argument("--device", type=str, default="cuda",
-                    help="'cuda' or 'cpu' for torch; 'CPU', 'GPU', 'NPU' for OpenVINO")
-    p.add_argument("--robot-port", type=str, default="/dev/ttyACM1")
-    p.add_argument("--robot-id", type=str, default="my_so101_follower")
-    p.add_argument("--overhead-camera", type=str, default=None,
-                    help="UVC device path for overhead camera (skip interactive selection)")
-    p.add_argument("--arm-camera", type=str, default=None,
-                    help="UVC device path for arm camera (skip interactive selection)")
-    p.add_argument("--task", type=str, default="pick a can and place it in a bowl")
-    p.add_argument("--fps", type=int, default=30)
-    p.add_argument("--queue-threshold", type=float, default=0.5,
-                    help="Request new inference when queue drops below this fraction of chunk size.")
-    p.add_argument("--lerp-duration", type=int, default=5,
-                    help="Number of frames over which to blend old/new chunks.")
-    p.add_argument("--flip-cameras", nargs="*", default=[],
-                    help="Camera names to rotate 180 degrees.")
-    p.add_argument("--infer-timeout", type=float, default=15.0,
-                    help="Seconds before force-resetting a stuck inference thread.")
-    p.add_argument("--max-speed", type=float, default=90.0,
-                    help="Max joint speed in degrees/second. Clamps action deltas per step.")
-    p.add_argument("--ramp-steps", type=int, default=30,
-                    help="Number of initial steps to ramp up speed (starts at 1/4 max-speed).")
-    p.add_argument("--no-compile", action="store_true", default=True,
-                    help="Disable torch.compile (default: disabled, torch only)")
-    # OpenVINO-specific options
-    p.add_argument("--camera-map", nargs="*",
-                    help="Map camera names to model input names, e.g. overhead=top arm=wrist (OV only)")
-    p.add_argument("--blank-cameras", nargs="*",
-                    help="Model camera inputs to fill with zeros, e.g. side (OV only)")
-    return p.parse_args()
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main(args: argparse.Namespace) -> None:
-    model_path = Path(args.model_path)
-    if not model_path.exists():
-        raise FileNotFoundError(f"Model not found: {model_path}")
-
-    # Auto-detect backend
-    if model_path.is_dir():
-        backend = "openvino"
-        # Default device for OpenVINO is CPU when user left the torch default
-        if args.device in ("cuda", "cuda:0"):
-            print(f"[init] Warning: OpenVINO backend does not support CUDA. Falling back to CPU.", flush=True)
-            args.device = "CPU"
-        else:
-            args.device = args.device.upper()
-    elif model_path.suffix == ".ckpt":
-        backend = "torch"
-    else:
-        raise ValueError(
-            f"Cannot determine backend from '{model_path}'. "
-            "Use a .ckpt file for PyTorch or a directory for OpenVINO."
-        )
-
-    flip_cameras = set(args.flip_cameras or [])
-    camera_name_map = dict(kv.split("=") for kv in (args.camera_map or []))
-    blank_cameras = list(args.blank_cameras or [])
-
-    print(f"[init] Model: {model_path}", flush=True)
-    print(f"[init] Backend: {backend}", flush=True)
-    print(f"[init] Device: {args.device}", flush=True)
-    print(f"[init] Flip cameras: {flip_cameras}", flush=True)
-    if backend == "openvino":
-        print(f"[init] Camera map: {camera_name_map}", flush=True)
-        print(f"[init] Blank cameras: {blank_cameras}", flush=True)
-
-    # --- Set up cameras ---
-    # Use CLI paths if provided, otherwise run interactive selection
-    if args.overhead_camera or args.arm_camera:
-        camera_paths: dict[str, str] = {}
-        if args.overhead_camera:
-            camera_paths["overhead"] = args.overhead_camera
-        if args.arm_camera:
-            camera_paths["arm"] = args.arm_camera
-    else:
-        camera_paths = select_cameras()
-
-    cameras: dict[str, SharedCamera] = {}
-    for cam_name, cam_device in camera_paths.items():
-        cameras[cam_name] = SharedCamera(
-            "uvc", device=cam_device,
-            width=IMAGE_WIDTH, height=IMAGE_HEIGHT, fps=CAMERA_FPS,
-        )
-
-    for name, cam in cameras.items():
-        cam.connect()
-        print(f"[init] Camera '{name}' connected: {cam.actual_width}x{cam.actual_height} @ {cam.actual_fps}fps", flush=True)
-
-    # --- Load model ---
-    if backend == "torch":
-        import torch
-        from physicalai.policies.pi05 import Pi05
-
-        print("[init] Loading Pi05 model (map_location=cpu) ...", flush=True)
-        model = Pi05.load_from_checkpoint(
-            str(model_path), map_location="cpu", compile_model=not args.no_compile,
-        )
-        print("[init] Checkpoint loaded to CPU, moving to device ...", flush=True)
-        model = model.to(args.device)
-        model.eval()
-        print(f"[init] Model loaded on {args.device}", flush=True)
-        if args.device == "cuda":
-            print(f"[init] VRAM used: {torch.cuda.memory_allocated() / 1e9:.2f} GB", flush=True)
-    else:
-        import openvino_tokenizers  # noqa: F401
-        from physicalai.inference import InferenceModel
-        from physicalai.inference.runners import SinglePass
-
-        print(f"[init] Loading OpenVINO model from {model_path} ...", flush=True)
-        model = InferenceModel(
-            export_dir=str(model_path),
-            device=args.device,
-            runner=SinglePass(),
-        )
-        print("[init] Model loaded", flush=True)
-
-
-    # --- Connect robot (LeRobot SO101Follower, no cameras) ---
-    print(f"[init] Connecting robot on {args.robot_port} ...", flush=True)
-    robot_cfg = SO101FollowerConfig(
-        port=args.robot_port,
-        id=args.robot_id,
-    )
-    robot = SO101Follower(robot_cfg)
-    robot.connect()
-    print(f"[init] Robot connected (id={args.robot_id})", flush=True)
-
-    # --- Warm-up inference to determine chunk size ---
-    print("[init] Getting warm-up observation ...", flush=True)
-    warmup_obs = get_full_observation(robot, cameras)
-    print(f"[init] Got observation, keys: {list(warmup_obs.keys())}", flush=True)
-
-    print("[init] Running warm-up inference ...", flush=True)
-    t0 = time.perf_counter()
-
-    if backend == "torch":
-        import torch
-        from collections import deque
-        from physicalai.data.observation import Observation
-
-        state = torch.tensor(
-            [[warmup_obs[f"{jn}.pos"] for jn in JOINT_NAMES]],
-            dtype=torch.float32, device=args.device,
-        )
-        warmup_images = {}
-        for cam_key in cameras:
-            img = np.ascontiguousarray(warmup_obs[cam_key])
-            if cam_key in flip_cameras:
-                img = cv2.rotate(img, cv2.ROTATE_180)
-            t = torch.from_numpy(img.copy()).float() / 255.0
-            t = t.permute(2, 0, 1).unsqueeze(0).to(args.device)
-            warmup_images[cam_key] = t
-
-        warmup_observation = Observation(state=state, images=warmup_images, task=args.task)
-        model._action_queue = deque()
-        with torch.no_grad():
-            warmup_action = model.select_action(warmup_observation)
-        if args.device == "cuda":
-            torch.cuda.synchronize()
-
-        warmup_actions = [warmup_action.detach().cpu().numpy()]
-        while len(model._action_queue) > 0:
-            a = model._action_queue.popleft()
-            warmup_actions.append(a.detach().cpu().numpy() if isinstance(a, torch.Tensor) else a)
-        warmup_chunk = np.array([np.squeeze(a) for a in warmup_actions], dtype=np.float32)
-    else:
-        # OV warm-up
-        warmup_state = np.array(
-            [[warmup_obs[f"{jn}.pos"] for jn in JOINT_NAMES]], dtype=np.float32,
-        )
-        warmup_input: dict[str, np.ndarray] = {"state": warmup_state, "task": [args.task]}
-        for cam_key in cameras:
-            img = np.ascontiguousarray(warmup_obs[cam_key])
-            if cam_key in flip_cameras:
-                img = cv2.rotate(img, cv2.ROTATE_180)
-            img = cv2.resize(img, (IMAGE_WIDTH, IMAGE_HEIGHT))
-            img = img.transpose(2, 0, 1).astype(np.float32)[np.newaxis] / 255.0
-            model_name = camera_name_map.get(cam_key, cam_key)
-            warmup_input[f"images.{model_name}"] = img
-        for blank_cam in blank_cameras:
-            warmup_input[f"images.{blank_cam}"] = np.zeros(
-                (1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), dtype=np.float32,
-            )
-        warmup_out = model(warmup_input)["action"]
-        warmup_chunk = np.squeeze(warmup_out)
-        if warmup_chunk.ndim == 1:
-            warmup_chunk = warmup_chunk[np.newaxis, :]
-        warmup_chunk = warmup_chunk.astype(np.float32)
-
-    warmup_latency = time.perf_counter() - t0
-    chunk_size = warmup_chunk.shape[0]
-    action_dim = warmup_chunk.shape[1]
-    print(f"[init] Warm-up done: chunk_size={chunk_size}, action_dim={action_dim}, "
-          f"latency={warmup_latency*1000:.1f}ms", flush=True)
-
-    # --- Init components ---
-    queue_mixer = QueueMixer(lerp_duration=args.lerp_duration)
-    queue_mixer.add(warmup_chunk, offset=0)
-
-    inference_thread = InferenceThread(
-        model, backend, args.device, args.task, flip_cameras,
-        camera_name_map=camera_name_map, blank_cameras=blank_cameras,
-    )
-    inference_thread.start()
-
-    goal_time = 1.0 / args.fps
-    threshold = int(chunk_size * args.queue_threshold)
-    max_delta_per_step = args.max_speed / args.fps  # degrees per step
-    step = 0
-    actions_from_queue = 0
-    hold_count = 0
-    last_action: np.ndarray = warmup_chunk[0]
-
-    # Initialize commanded position from current robot state (degrees/[0,100])
-    init_obs = robot.get_observation()
-    commanded_pos = np.array(
-        [init_obs[f"{jn}.pos"] for jn in JOINT_NAMES], dtype=np.float32,
-    )
-    print(f"[init] Current joint pos: {commanded_pos}", flush=True)
-    print(f"[init] Max speed: {args.max_speed:.0f} deg/s → {max_delta_per_step:.1f} per step @ {args.fps}Hz", flush=True)
-    print(f"[init] Ramp-up: {args.ramp_steps} steps", flush=True)
-    print("[run] Starting — Ctrl+C to stop", flush=True)
-
-    try:
-        while True:
-            loop_start = time.perf_counter()
-
-            # 1. Check for inference results
-            result = inference_thread.get_result()
-            if result is not None:
-                chunk, latency = result
-                offset = int(latency * args.fps)
-                logger.info(
-                    "[Control] Got inference #%d: chunk=%s, latency=%.1fms, offset=%d",
-                    inference_thread.inference_count, chunk.shape, latency * 1000, offset,
-                )
-                queue_mixer.add(chunk, offset=offset)
-                queue_mixer.lerp_duration = max(offset, 1)
-
-            # 2. Maybe request new inference (with stuck/dead detection)
-            if queue_mixer.remaining <= threshold:
-                if not inference_thread.alive:
-                    logger.error("[Control] Inference thread DEAD — restarting")
-                    inference_thread = InferenceThread(
-                        model, backend, args.device, args.task, flip_cameras,
-                        camera_name_map=camera_name_map, blank_cameras=blank_cameras,
-                    )
-                    inference_thread.start()
-                elif inference_thread.busy_duration > args.infer_timeout:
-                    logger.warning(
-                        "[Control] Inference stuck for %.0fs — force resetting",
-                        inference_thread.busy_duration,
-                    )
-                    inference_thread.force_reset()
-
-                if not inference_thread.busy:
-                    obs = get_full_observation(robot, cameras)
-                    submitted = inference_thread.request(obs)
-                    if submitted:
-                        logger.info(
-                            "[Control] Requested inference (queue_remaining=%d, threshold=%d)",
-                            queue_mixer.remaining, threshold,
-                        )
-
-            # 3. Pop action from queue
-            action = queue_mixer.pop()
-            if action is not None:
-                last_action = action
-                actions_from_queue += 1
-                hold_count = 0
-            else:
-                action = last_action
-                hold_count += 1
-                if hold_count == 1 or hold_count % 30 == 0:
-                    logger.warning(
-                        "[Control] Queue empty, holding position (hold_count=%d)", hold_count,
-                    )
-
-            # 4. Velocity clamp and send
-            if step < args.ramp_steps:
-                ramp_frac = 0.25 + 0.75 * (step / args.ramp_steps)
-                effective_delta = max_delta_per_step * ramp_frac
-            else:
-                effective_delta = max_delta_per_step
-
-            delta = action - commanded_pos
-            clamped_delta = np.clip(delta, -effective_delta, effective_delta)
-            commanded_pos = (commanded_pos + clamped_delta).astype(np.float32)
-            robot.send_action(action_to_robot_dict(commanded_pos))
-
-            # 5. Timing
-            elapsed = time.perf_counter() - loop_start
-            sleep_time = goal_time - elapsed
-            if sleep_time > 0:
-                time.sleep(sleep_time)
-
-            step += 1
-            if step % 100 == 0:
-                actual_hz = 1.0 / max(time.perf_counter() - loop_start, 1e-6)
-                logger.info(
-                    "[Control] Step %d | %.1f Hz | queue=%d | from_queue=%d | holds=%d | inferences=%d",
-                    step, actual_hz, queue_mixer.remaining,
-                    actions_from_queue, hold_count, inference_thread.inference_count,
-                )
-
-    except KeyboardInterrupt:
-        logger.info("[Control] Interrupted by user")
-    finally:
-        inference_thread.stop()
-        for name, cam in cameras.items():
-            cam.disconnect()
-            logger.info("Camera '%s' disconnected", name)
-        robot.disconnect()
-        logger.info(
-            "[Control] Cleanup complete — %d steps, %d actions from queue, %d holds, %d inferences",
-            step, actions_from_queue, hold_count, inference_thread.inference_count,
-        )
-
-
-if __name__ == "__main__":
-    main(parse_args())

From 4368f33e046158ea800df2fc530a107200e56b73 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 13:31:35 +0200
Subject: [PATCH 07/19] add example script

---
 examples/runtime/async_inference.py | 128 ++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 examples/runtime/async_inference.py

diff --git a/examples/runtime/async_inference.py b/examples/runtime/async_inference.py
new file mode 100644
index 0000000..834fe00
--- /dev/null
+++ b/examples/runtime/async_inference.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# Copyright (C) 2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""Async inference with PolicyRuntime.
+
+python examples/runtime/async_inference.py \
+  --model ./exports/pi05_cans_openvino \
+  --device GPU.0 \
+  --port /dev/ttyACM0 \
+  --calibration /home/max/.cache/physicalai/robots/a8d8d997-a59e-4423-9006-5d991d223887/calibrations/0b2f185a-8ab2-4956-91c2-3a2ac2dbd8c1.json \
+  --overhead-camera /dev/v4l/by-id/usb-UGREEN_Camera_2K_UGREEN_Camera_2K_SN0001-video-index0 \
+  --arm-camera 353322271391 \
+  --front-camera /dev/v4l/by-id/usb-Innomaker_Innomaker-U20CAM-1080p-S1_SN0001-video-index0 \
+  --width 640 \
+  --height 480 \
+  --fps 30 \
+  --duration-s 60
+"""
+
+from __future__ import annotations
+
+import argparse
+from typing import Any
+
+import openvino as ov
+import numpy as np
+from numpy import core
+
+from physicalai.capture import Frame, discover_all
+from physicalai.capture.transport import SharedCamera
+from physicalai.inference import InferenceModel
+from physicalai.robot import SO101
+from physicalai.robot.interface import RobotObservation
+from physicalai.runtime import (
+    ActionQueue,
+    AsyncExecution,
+    LerpSmoother,
+    PolicyRuntime,
+)
+
+
+def obs_to_input(
+    robot_obs: RobotObservation,
+    camera_frames: dict[str, Frame],
+) -> dict[str, Any]:
+    """Convert robot observation + camera frames to model input dict."""
+    state = np.asarray(robot_obs.joint_positions, dtype=np.float32)
+    if state.ndim == 1:
+        state = state[np.newaxis]  # (state_dim,) -> (1, state_dim)
+    model_input: dict[str, Any] = {
+        "state": state,
+    }
+    for name, frame in camera_frames.items():
+        # (H, W, C) -> (1, C, H, W) float32 [0, 1]
+        img = frame.data.transpose(2, 0, 1).astype(np.float32)[np.newaxis] / 255.0
+        model_input[f"images.{name}"] = img
+    return model_input
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run policy with PolicyRuntime")
+    parser.add_argument("--model", required=True, help="Exported model directory")
+    parser.add_argument("--device", default="GPU.0", help="OpenVINO device")
+    parser.add_argument("--port", default="/dev/ttyACM0", help="Robot serial port")
+    parser.add_argument("--calibration", required=True, help="Robot calibration file")
+    parser.add_argument("--overhead-camera", required=True, help="Overhead camera device path")
+    parser.add_argument("--arm-camera", required=True, help="Arm camera serial number")
+    parser.add_argument("--front-camera", required=True, help="Front camera device path")
+    parser.add_argument("--width", type=int, default=640)
+    parser.add_argument("--height", type=int, default=480)
+    parser.add_argument("--duration-s", type=float, default=60.0)
+    parser.add_argument("--fps", type=float, default=30.0)
+    args = parser.parse_args()
+
+    import openvino_tokenizers  # noqa: F401 — registers OV tokenizer ops
+
+    print(f"Available devices:")
+    core = ov.Core()
+    devices = core.available_devices
+    for dev in devices:
+        print(f"  {dev}: {core.get_property(dev, 'FULL_DEVICE_NAME')}")
+    print(f"Selected device: {args.device}")
+
+    model = InferenceModel.load(args.model, device=args.device)
+    robot = SO101(port=args.port, calibration=args.calibration, role="follower")
+    cameras = {
+        "overhead": SharedCamera("uvc", device=args.overhead_camera, width=args.width, height=args.height, fps=int(args.fps)),
+        "front": SharedCamera("uvc", device=args.front_camera, width=args.width, height=args.height, fps=int(args.fps)),
+        "arm": SharedCamera("realsense", serial_number=args.arm_camera, width=args.width, height=args.height, fps=int(args.fps)),
+    }
+
+    runtime = PolicyRuntime(
+        robot=robot,
+        model=model,
+        execution=AsyncExecution(threshold=0.3, fps=int(args.fps)),
+        action_queue=ActionQueue(smoother=LerpSmoother(duration_frames=5)),
+        cameras=cameras,
+        fps=args.fps,
+        obs_to_input=obs_to_input,
+    )
+
+    robot.connect()
+    for name, cam in cameras.items():
+        try:
+            cam.connect()
+        except Exception as e:
+            print(f"Failed to connect camera '{name}': {e}")
+            print("Available cameras:")
+            for c in discover_all():
+                print(f"Driver: {c.driver}, Device: {c.device}, Info: {c.info}")
+            return
+        print(f"Camera '{name}' connected: {cam.actual_width}x{cam.actual_height} @ {cam.actual_fps}fps")
+
+    print("Starting policy runtime...")
+    try:
+        stats = runtime.run(duration_s=args.duration_s)
+        print(f"\nDone — {stats.steps} steps, {stats.inference_count} inferences, {stats.total_holds} holds")
+    finally:
+        for name, cam in cameras.items():
+            cam.disconnect()
+            print(f"Camera '{name}' disconnected")
+        robot.disconnect()
+        print("Robot disconnected")
+
+
+if __name__ == "__main__":
+    main()

From ed9fda70386629c4974580719ae18d2c554193ca Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 16:15:06 +0200
Subject: [PATCH 08/19] replace ActionCursor with deque, return 1D and 2D shape
 for select_action() and predict_action_chunk()

---
 src/physicalai/inference/model.py             |  27 ++---
 src/physicalai/inference/utils/__init__.py    |   8 --
 .../inference/utils/action_cursor.py          |  90 ---------------
 tests/unit/inference/test_action_cursor.py    | 107 ------------------
 4 files changed, 14 insertions(+), 218 deletions(-)
 delete mode 100644 src/physicalai/inference/utils/__init__.py
 delete mode 100644 src/physicalai/inference/utils/action_cursor.py
 delete mode 100644 tests/unit/inference/test_action_cursor.py

diff --git a/src/physicalai/inference/model.py b/src/physicalai/inference/model.py
index 39f4fc4..8d943a9 100644
--- a/src/physicalai/inference/model.py
+++ b/src/physicalai/inference/model.py
@@ -7,9 +7,11 @@
 
 import json
 import warnings
+from collections import deque
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Self
 
+import numpy as np
 import yaml
 
 from physicalai.inference.adapters import adapter_registry, get_adapter
@@ -17,11 +19,8 @@
 from physicalai.inference.constants import ACTION
 from physicalai.inference.manifest import ComponentSpec, Manifest
 from physicalai.inference.runners import get_runner
-from physicalai.inference.utils import ActionCursor
 
 if TYPE_CHECKING:
-    import numpy as np
-
     from physicalai.inference.adapters.base import RuntimeAdapter
     from physicalai.inference.callbacks.base import Callback
     from physicalai.inference.postprocessors.base import Postprocessor
@@ -125,7 +124,7 @@ def __init__(
         for callback in self.callbacks:
             callback.on_load(self)
 
-        self.cursor = ActionCursor()
+        self._action_buffer: deque[np.ndarray] = deque()
 
     @property
     def chunk_size(self) -> int:
@@ -207,31 +206,33 @@ def select_action(self, observation: dict[str, np.ndarray]) -> np.ndarray:
             observation: Observation dict mapping names to numpy arrays.
 
         Returns:
-            Action array to execute.
+            1-D action vector with shape ``(action_dim,)``.
 
         Examples:
             >>> obs = env.reset()
             >>> action = policy.select_action(obs)
             >>> next_obs, reward, done = env.step(action)
         """
-        if self.cursor.empty:
-            self.cursor.push_chunk(self(observation)[ACTION])
-
-        return self.cursor.pop()
+        if not self._action_buffer:
+            self._action_buffer.extend(self.predict_action_chunk(observation))
+        return self._action_buffer.popleft()
 
     def predict_action_chunk(self, observation: dict[str, np.ndarray]) -> np.ndarray:
         """Predict a chunk of actions for the given observation.
 
-        Delegates to ``__call__`` and extracts the ``"action_chunk"`` key.
+        Delegates to ``__call__`` and extracts the ``"action"`` key.
 
         Args:
             observation: Observation dict mapping names to numpy arrays.
 
         Returns:
-            Chunk of actions to execute.
+            2-D action chunk with shape ``(chunk_size, action_dim)``.
         """
         outputs = self(observation)
-        return outputs[ACTION]
+        actions = np.asarray(outputs[ACTION])
+        # Drop the adapter's batch dimension so the return value matches
+        # the documented contract: a 2-D (chunk, action_dim) sequence.
+        return np.atleast_2d(np.squeeze(actions))
 
     def reset(self) -> None:
         """Reset policy state for new episode.
@@ -250,7 +251,7 @@ def reset(self) -> None:
             ...         obs, reward, done = env.step(action)
         """
         self.runner.reset()
-        self.cursor.reset()
+        self._action_buffer.clear()
         for callback in self.callbacks:
             callback.on_reset()
 
diff --git a/src/physicalai/inference/utils/__init__.py b/src/physicalai/inference/utils/__init__.py
deleted file mode 100644
index c576e49..0000000
--- a/src/physicalai/inference/utils/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (C) 2026 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-"""Inference utilities."""
-
-from physicalai.inference.utils.action_cursor import ActionCursor
-
-__all__ = ["ActionCursor"]
diff --git a/src/physicalai/inference/utils/action_cursor.py b/src/physicalai/inference/utils/action_cursor.py
deleted file mode 100644
index a219124..0000000
--- a/src/physicalai/inference/utils/action_cursor.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright (C) 2026 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-"""ActionCursor: action-chunk buffer for temporal action dispensing."""
-
-from __future__ import annotations
-
-from collections import deque
-
-import numpy as np
-
-
-class ActionCursor:
-    """Buffer that queues an action chunk and dispenses one timestep per call.
-
-    Call :meth:`push_chunk` with the full action output from the runner
-    (shape ``(batch, T, action_dim)``), then call :meth:`pop` repeatedly
-    to retrieve individual timestep actions (shape ``(batch, action_dim)``).
-    When the buffer is exhausted, :attr:`empty` is ``True`` and a new
-    chunk should be pushed.
-
-    Examples:
-            >>> cursor = ActionCursor()
-            >>> cursor.empty
-            True
-            >>> chunk = np.random.randn(1, 10, 7)  # batch=1, T=10, action_dim=7
-            >>> cursor.push_chunk(chunk)
-            >>> cursor.empty
-            False
-            >>> action = cursor.pop()  # shape (1, 7)
-            >>> cursor.reset()
-            >>> cursor.empty
-            True
-    """
-
-    def __init__(self) -> None:
-        """Initialize an empty ActionCursor with no buffered actions."""
-        self._queue: deque[np.ndarray] = deque()
-
-    @property
-    def empty(self) -> bool:
-        """True when there are no buffered actions remaining."""
-        return len(self._queue) == 0
-
-    def push_chunk(self, chunk: np.ndarray) -> None:
-        """Queue all timestep slices from an action chunk.
-
-        Args:
-                chunk: Action array with shape ``(batch, T, action_dim)`` or
-                        ``(T, action_dim)``.  Each of the ``T`` timestep slices is
-                        enqueued individually.
-
-        Raises:
-                ValueError: If ``chunk.ndim`` is not 2 or 3.
-        """
-        min_batched_action_dim = 2
-        batched_temporal_dim = 3
-        if chunk.ndim not in {min_batched_action_dim, batched_temporal_dim}:
-            msg = f"Chunk must be a 2-D or 3-D array, got ndim={chunk.ndim}."
-            raise ValueError(msg)
-
-        if chunk.ndim == min_batched_action_dim:
-            # (T, action_dim) - no batch dimension
-            self._queue.extend(chunk)
-        else:
-            # (batch, T, action_dim) - transpose to (T, batch, action_dim)
-            self._queue.extend(np.transpose(chunk, (1, 0, 2)))
-
-    def pop(self) -> np.ndarray:
-        """Return and remove the next buffered action.
-
-        Returns:
-                Action array for the current timestep, shape ``(batch, action_dim)``
-                or ``(action_dim,)`` depending on what was pushed.
-
-        Raises:
-                IndexError: If the queue is empty.
-        """
-        if self.empty:
-            msg = "ActionCursor is empty; call push_chunk before pop."
-            raise IndexError(msg)
-        return self._queue.popleft()
-
-    def reset(self) -> None:
-        """Clear all buffered actions."""
-        self._queue.clear()
-
-    def __repr__(self) -> str:
-        """Return string representation."""
-        return f"ActionCursor(buffered={len(self._queue)})"
diff --git a/tests/unit/inference/test_action_cursor.py b/tests/unit/inference/test_action_cursor.py
deleted file mode 100644
index 3a82b87..0000000
--- a/tests/unit/inference/test_action_cursor.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (C) 2026 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import numpy as np
-import pytest
-
-from physicalai.inference.utils.action_cursor import ActionCursor
-
-
-class TestActionCursor:
-    def test_init_starts_empty_and_repr(self) -> None:
-        cursor = ActionCursor()
-        assert cursor.empty is True
-        assert repr(cursor) == "ActionCursor(buffered=0)"
-
-    def test_push_chunk_2d_pops_in_row_order(self) -> None:
-        cursor = ActionCursor()
-        chunk = np.array(
-            [
-                [1.0, 2.0],
-                [3.0, 4.0],
-                [5.0, 6.0],
-            ],
-            dtype=np.float32,
-        )
-
-        cursor.push_chunk(chunk)
-
-        np.testing.assert_array_equal(cursor.pop(), np.array([1.0, 2.0], dtype=np.float32))
-        np.testing.assert_array_equal(cursor.pop(), np.array([3.0, 4.0], dtype=np.float32))
-        np.testing.assert_array_equal(cursor.pop(), np.array([5.0, 6.0], dtype=np.float32))
-        assert cursor.empty is True
-
-    def test_push_chunk_3d_pops_time_slices(self) -> None:
-        cursor = ActionCursor()
-        chunk = np.array(
-            [
-                [[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]],
-                [[4.0, 40.0], [5.0, 50.0], [6.0, 60.0]],
-            ],
-            dtype=np.float32,
-        )
-
-        cursor.push_chunk(chunk)
-
-        np.testing.assert_array_equal(
-            cursor.pop(),
-            np.array([[1.0, 10.0], [4.0, 40.0]], dtype=np.float32),
-        )
-        np.testing.assert_array_equal(
-            cursor.pop(),
-            np.array([[2.0, 20.0], [5.0, 50.0]], dtype=np.float32),
-        )
-        np.testing.assert_array_equal(
-            cursor.pop(),
-            np.array([[3.0, 30.0], [6.0, 60.0]], dtype=np.float32),
-        )
-        assert cursor.empty is True
-
-    def test_pop_empty_raises_index_error(self) -> None:
-        cursor = ActionCursor()
-        with pytest.raises(IndexError, match="ActionCursor is empty; call push_chunk before pop"):
-            cursor.pop()
-
-    def test_reset_clears_buffered_actions(self) -> None:
-        cursor = ActionCursor()
-        cursor.push_chunk(np.array([[1.0], [2.0]], dtype=np.float32))
-
-        cursor.reset()
-
-        assert cursor.empty is True
-        with pytest.raises(IndexError):
-            cursor.pop()
-
-    def test_multiple_pushes_append_in_fifo_order(self) -> None:
-        cursor = ActionCursor()
-        first = np.array([[1.0, 1.0], [2.0, 2.0]], dtype=np.float32)
-        second = np.array(
-            [
-                [[3.0, 3.0], [4.0, 4.0]],
-            ],
-            dtype=np.float32,
-        )
-
-        cursor.push_chunk(first)
-        cursor.push_chunk(second)
-
-        np.testing.assert_array_equal(cursor.pop(), np.array([1.0, 1.0], dtype=np.float32))
-        np.testing.assert_array_equal(cursor.pop(), np.array([2.0, 2.0], dtype=np.float32))
-        np.testing.assert_array_equal(cursor.pop(), np.array([[3.0, 3.0]], dtype=np.float32))
-        np.testing.assert_array_equal(cursor.pop(), np.array([[4.0, 4.0]], dtype=np.float32))
-        assert cursor.empty is True
-
-    @pytest.mark.parametrize(
-        "chunk",
-        [
-            np.array([1.0, 2.0], dtype=np.float32),
-            np.zeros((1, 2, 3, 4), dtype=np.float32),
-        ],
-    )
-    def test_push_chunk_invalid_ndim_raises_value_error(self, chunk: np.ndarray) -> None:
-        cursor = ActionCursor()
-
-        with pytest.raises(ValueError, match=r"Chunk must be a 2-D or 3-D array, got ndim="):
-            cursor.push_chunk(chunk)

From cdb3fab2b81e2324ea58bd67cba5e6c20c0937c0 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 22:10:08 +0200
Subject: [PATCH 09/19] remove obs_to_input, move inside policy runtime

---
 examples/runtime/async_inference.py           | 24 +--------
 .../inference/preprocessors/pi05.py           |  8 +--
 src/physicalai/runtime/__init__.py            |  3 --
 src/physicalai/runtime/runtime.py             | 45 ++++++----------
 tests/unit/runtime/test_runtime.py            | 54 +------------------
 5 files changed, 22 insertions(+), 112 deletions(-)

diff --git a/examples/runtime/async_inference.py b/examples/runtime/async_inference.py
index 834fe00..b2e9bb8 100644
--- a/examples/runtime/async_inference.py
+++ b/examples/runtime/async_inference.py
@@ -21,17 +21,14 @@
 from __future__ import annotations
 
 import argparse
-from typing import Any
 
 import openvino as ov
 import numpy as np
-from numpy import core
 
-from physicalai.capture import Frame, discover_all
+from physicalai.capture import discover_all
 from physicalai.capture.transport import SharedCamera
 from physicalai.inference import InferenceModel
 from physicalai.robot import SO101
-from physicalai.robot.interface import RobotObservation
 from physicalai.runtime import (
     ActionQueue,
     AsyncExecution,
@@ -40,24 +37,6 @@
 )
 
 
-def obs_to_input(
-    robot_obs: RobotObservation,
-    camera_frames: dict[str, Frame],
-) -> dict[str, Any]:
-    """Convert robot observation + camera frames to model input dict."""
-    state = np.asarray(robot_obs.joint_positions, dtype=np.float32)
-    if state.ndim == 1:
-        state = state[np.newaxis]  # (state_dim,) -> (1, state_dim)
-    model_input: dict[str, Any] = {
-        "state": state,
-    }
-    for name, frame in camera_frames.items():
-        # (H, W, C) -> (1, C, H, W) float32 [0, 1]
-        img = frame.data.transpose(2, 0, 1).astype(np.float32)[np.newaxis] / 255.0
-        model_input[f"images.{name}"] = img
-    return model_input
-
-
 def main():
     parser = argparse.ArgumentParser(description="Run policy with PolicyRuntime")
     parser.add_argument("--model", required=True, help="Exported model directory")
@@ -97,7 +76,6 @@ def main():
         action_queue=ActionQueue(smoother=LerpSmoother(duration_frames=5)),
         cameras=cameras,
         fps=args.fps,
-        obs_to_input=obs_to_input,
     )
 
     robot.connect()
diff --git a/src/physicalai/inference/preprocessors/pi05.py b/src/physicalai/inference/preprocessors/pi05.py
index 0d5c68e..c3d631f 100644
--- a/src/physicalai/inference/preprocessors/pi05.py
+++ b/src/physicalai/inference/preprocessors/pi05.py
@@ -135,7 +135,9 @@ def _preprocess_images(
             if img.ndim == max_image_dim:
                 img = img[:, -1, :, :, :]
 
-            if img.dtype != np.float32:
+            if img.dtype == np.uint8:
+                img = img.astype(np.float32) / 255.0
+            elif img.dtype != np.float32:
                 img = img.astype(np.float32)
 
             # Detect layout: assume channels-first when dim-1 == 3
@@ -152,8 +154,8 @@ def _preprocess_images(
             # [0, 1] -> [-1, 1]
             img = img * 2.0 - 1.0
 
-            if channels_first:
-                img = np.transpose(img, (0, 3, 1, 2))  # -> (B, C, H, W)
+            # Output is always (B, C, H, W) — img is HWC at this point
+            img = np.transpose(img, (0, 3, 1, 2))
 
             bsize = img.shape[0]
             mask = np.ones(bsize, dtype=np.bool_)
diff --git a/src/physicalai/runtime/__init__.py b/src/physicalai/runtime/__init__.py
index 4be467a..bbc41d0 100644
--- a/src/physicalai/runtime/__init__.py
+++ b/src/physicalai/runtime/__init__.py
@@ -9,7 +9,6 @@
     from physicalai.runtime import SyncExecution, AsyncExecution, Execution, WorkerDiedError
     from physicalai.runtime import ActionQueue
     from physicalai.runtime import ChunkSmoother, LerpSmoother, ReplaceSmoother
-    from physicalai.runtime import default_observation_to_input
 """
 
 from physicalai.runtime._action_queue import ActionQueue  # noqa: PLC2701
@@ -23,7 +22,6 @@
     PolicyRuntime,
     RunStats,
     RuntimeCallback,
-    default_observation_to_input,
 )
 from physicalai.runtime.smoothers import ChunkSmoother, LerpSmoother, ReplaceSmoother
 
@@ -39,5 +37,4 @@
     "RuntimeCallback",
     "SyncExecution",
     "WorkerDiedError",
-    "default_observation_to_input",
 ]
diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index b0b7e2b..b947a48 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -17,42 +17,18 @@
 from physicalai.runtime.smoothers import LerpSmoother
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Mapping, Sequence
+    from collections.abc import Mapping, Sequence
 
     from physicalai.capture.camera import Camera
     from physicalai.capture.frame import Frame
     from physicalai.inference.model import InferenceModel
-    from physicalai.robot.interface import Robot, RobotObservation
+    from physicalai.robot.interface import Robot
 
 logger = logging.getLogger(__name__)
 
 _DEFAULT_LERP_FRAMES = 5
 
 
-def default_observation_to_input(
-    robot_obs: RobotObservation,
-    camera_frames: dict[str, Frame],
-) -> dict[str, Any]:
-    """Convert robot observation and camera frames to model input dict.
-
-    Maps:
-        - ``robot_obs.joint_positions`` → ``"state"`` (as batch dim)
-        - ``frame.data`` per camera → ``"images.{name}"``
-
-    Returns:
-        Model input dictionary.
-    """
-    model_input: dict[str, Any] = {}
-
-    if robot_obs.joint_positions is not None:
-        model_input["state"] = np.array([robot_obs.joint_positions], dtype=np.float32)
-
-    for name, frame in camera_frames.items():
-        model_input[f"images.{name}"] = frame.data
-
-    return model_input
-
-
 class RuntimeCallback(Protocol):
     """Optional hook points in the PolicyRuntime control loop."""
 
@@ -96,7 +72,6 @@ def __init__(  # noqa: D107
         fps: float,
         cameras: Mapping[str, Camera] | None = None,
         action_queue: ActionQueue | None = None,
-        obs_to_input: Callable[[RobotObservation, dict[str, Frame]], dict[str, Any]] | None = None,
         callbacks: Sequence[RuntimeCallback] = (),
     ) -> None:
         if fps <= 0:
@@ -108,7 +83,6 @@ def __init__(  # noqa: D107
         self._fps = fps
         self._cameras: Mapping[str, Camera] = cameras or {}
         self._action_queue = action_queue or ActionQueue(smoother=LerpSmoother(duration_frames=_DEFAULT_LERP_FRAMES))
-        self._obs_to_input = obs_to_input or default_observation_to_input
         self._callbacks = list(callbacks)
         self._goal_time = (1.0 / fps) * 3
 
@@ -194,8 +168,19 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
 
     def _build_model_input(self) -> dict[str, Any]:
         robot_obs = self._robot.get_observation()
-        camera_frames = {name: cam.read_latest() for name, cam in self._cameras.items()}
-        return self._obs_to_input(robot_obs, camera_frames)
+        model_input: dict[str, Any] = {}
+
+        if robot_obs.joint_positions is not None:
+            model_input["state"] = np.array([robot_obs.joint_positions], dtype=np.float32)
+
+        # Merge robot-embedded images and external cameras
+        if robot_obs.images:
+            for name, frame in robot_obs.images.items():
+                model_input[f"images.{name}"] = frame.data[np.newaxis]
+        for name, cam in self._cameras.items():
+            model_input[f"images.{name}"] = cam.read_latest().data[np.newaxis]
+
+        return model_input
 
     def _shutdown(self, step: int) -> None:
         self._execution.stop()
diff --git a/tests/unit/runtime/test_runtime.py b/tests/unit/runtime/test_runtime.py
index 6f1b378..ccb01c0 100644
--- a/tests/unit/runtime/test_runtime.py
+++ b/tests/unit/runtime/test_runtime.py
@@ -16,7 +16,7 @@
 
 from physicalai.runtime._action_queue import ActionQueue
 from physicalai.runtime.execution import SyncExecution, WorkerDiedError
-from physicalai.runtime.runtime import PolicyRuntime, RunStats, default_observation_to_input
+from physicalai.runtime.runtime import PolicyRuntime, RunStats
 
 from physicalai.capture import Frame
 
@@ -235,55 +235,3 @@ def test_fields_populated(self) -> None:
         assert stats.total_pops == 8
         assert stats.total_holds == 2
         assert stats.inference_count == 3
-
-
-class TestDefaultObservationToInput:
-    def test_extracts_joint_positions(self) -> None:
-        positions = np.array([0.1, 0.2, 0.3], dtype=np.float32)
-        obs = FakeRobotObservation(
-            joint_positions=positions,
-            timestamp=0.0,
-            sensor_data=None,
-            images=None,
-        )
-
-        result = default_observation_to_input(obs, {})
-
-        assert "state" in result
-        np.testing.assert_array_equal(result["state"], np.array([positions], dtype=np.float32))
-
-    def test_extracts_frame_data(self) -> None:
-        obs = FakeRobotObservation(
-            joint_positions=np.zeros(3),
-            timestamp=0.0,
-            sensor_data=None,
-            images=None,
-        )
-        frame = Frame(data=np.zeros((480, 640, 3), dtype=np.uint8), timestamp=0.0, sequence=0)
-
-        result = default_observation_to_input(obs, camera_frames={"cam0": frame})
-
-        assert "images.cam0" in result
-        np.testing.assert_array_equal(result["images.cam0"], frame.data)
-
-    def test_custom_obs_to_input(self) -> None:
-        robot = _make_mock_robot()
-        model = _make_mock_model(chunk_size=10)
-        execution = SyncExecution()
-
-        custom_fn = MagicMock(return_value={"custom_key": np.zeros(3)})
-
-        runtime = PolicyRuntime(
-            robot=robot,
-            model=model,
-            execution=execution,
-            fps=10.0,
-            obs_to_input=custom_fn,
-        )
-
-        with patch("physicalai.runtime.runtime.time") as mock_time:
-            mock_time.perf_counter.return_value = 0.0
-            mock_time.sleep = MagicMock()
-            runtime.run(duration_s=0.1)
-
-        assert custom_fn.call_count >= 2

From 41003bb99955a52e3c56dcca0aa0e5c7a1505bd8 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 22:51:09 +0200
Subject: [PATCH 10/19] Address comment: runtime manages robot/camera lifecycle

---
 examples/runtime/async_inference.py | 25 +++++-----
 src/physicalai/runtime/runtime.py   | 72 ++++++++++++++++++++++++++++-
 tests/unit/runtime/test_runtime.py  | 36 ++++++++++++---
 3 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/examples/runtime/async_inference.py b/examples/runtime/async_inference.py
index b2e9bb8..006496c 100644
--- a/examples/runtime/async_inference.py
+++ b/examples/runtime/async_inference.py
@@ -78,16 +78,16 @@ def main():
         fps=args.fps,
     )
 
-    robot.connect()
+    try:
+        runtime.connect()
+    except Exception as e:
+        print(f"Failed to connect: {e}")
+        print("Available cameras:")
+        for c in discover_all():
+            print(f"  Driver: {c.driver}, Device: {c.device}, Info: {c.info}")
+        return
+
     for name, cam in cameras.items():
-        try:
-            cam.connect()
-        except Exception as e:
-            print(f"Failed to connect camera '{name}': {e}")
-            print("Available cameras:")
-            for c in discover_all():
-                print(f"Driver: {c.driver}, Device: {c.device}, Info: {c.info}")
-            return
         print(f"Camera '{name}' connected: {cam.actual_width}x{cam.actual_height} @ {cam.actual_fps}fps")
 
     print("Starting policy runtime...")
@@ -95,11 +95,8 @@ def main():
         stats = runtime.run(duration_s=args.duration_s)
         print(f"\nDone — {stats.steps} steps, {stats.inference_count} inferences, {stats.total_holds} holds")
     finally:
-        for name, cam in cameras.items():
-            cam.disconnect()
-            print(f"Camera '{name}' disconnected")
-        robot.disconnect()
-        print("Robot disconnected")
+        runtime.disconnect()
+        print("Disconnected")
 
 
 if __name__ == "__main__":
diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index b947a48..42c1df6 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -8,7 +8,7 @@
 import logging
 import time
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Protocol
+from typing import TYPE_CHECKING, Any, Protocol, Self
 
 import numpy as np
 
@@ -61,7 +61,11 @@ class PolicyRuntime:
     """Runs a policy on robot hardware.
 
     Loop: observe → maybe_request → pop → send → sleep.
-    Robot and cameras must be connected before run(). Caller owns lifecycle.
+
+    Supports context manager for safe lifecycle management::
+
+        with PolicyRuntime(robot=robot, model=model, ...) as runtime:
+            stats = runtime.run(duration_s=60.0)
     """
 
     def __init__(  # noqa: D107
@@ -85,6 +89,66 @@ def __init__(  # noqa: D107
         self._action_queue = action_queue or ActionQueue(smoother=LerpSmoother(duration_frames=_DEFAULT_LERP_FRAMES))
         self._callbacks = list(callbacks)
         self._goal_time = (1.0 / fps) * 3
+        self._connected = False
+
+    def connect(self) -> None:
+        """Connect robot and cameras.
+
+        Connects robot first, then cameras in dict order. On failure,
+        disconnects everything already connected and re-raises.
+
+        Idempotent — calling on an already-connected runtime is a no-op.
+        """
+        if self._connected:
+            logger.debug("connect() called but already connected — no-op")
+            return
+
+        self._robot.connect()
+        connected_cameras: list[str] = []
+        try:
+            for name, cam in self._cameras.items():
+                cam.connect()
+                connected_cameras.append(name)
+        except Exception:
+            for cam_name in connected_cameras:
+                try:
+                    self._cameras[cam_name].disconnect()
+                except Exception:
+                    logger.warning("Failed to disconnect camera '%s' during rollback", cam_name, exc_info=True)
+            try:
+                self._robot.disconnect()
+            except Exception:
+                logger.warning("Failed to disconnect robot during rollback", exc_info=True)
+            raise
+
+        self._connected = True
+
+    def disconnect(self) -> None:
+        """Disconnect cameras then robot. Never raises.
+
+        Idempotent — calling on an already-disconnected runtime is a no-op.
+        """
+        if not self._connected:
+            return
+
+        for name, cam in self._cameras.items():
+            try:
+                cam.disconnect()
+            except Exception:
+                logger.warning("Failed to disconnect camera '%s'", name, exc_info=True)
+        try:
+            self._robot.disconnect()
+        except Exception:
+            logger.warning("Failed to disconnect robot", exc_info=True)
+
+        self._connected = False
+
+    def __enter__(self) -> Self:
+        self.connect()
+        return self
+
+    def __exit__(self, *exc_info: object) -> None:
+        self.disconnect()
 
     def run(self, *, duration_s: float | None = None) -> RunStats:
         """Run the control loop.
@@ -96,8 +160,12 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
             Statistics from the run session.
 
         Raises:
+            RuntimeError: If called before connect().
             WorkerDiedError: If the inference worker thread dies.
         """
+        if not self._connected:
+            msg = "PolicyRuntime.run() called before connect(). Use 'with runtime:' or call runtime.connect() first."
+            raise RuntimeError(msg)
         self._execution.start(self._model, self._action_queue)
         sample_obs = self._build_model_input()
         self._execution.warmup(sample_obs)
diff --git a/tests/unit/runtime/test_runtime.py b/tests/unit/runtime/test_runtime.py
index ccb01c0..839adb2 100644
--- a/tests/unit/runtime/test_runtime.py
+++ b/tests/unit/runtime/test_runtime.py
@@ -48,6 +48,13 @@ def _make_mock_model(chunk_size: int = 4, action_dim: int = 3) -> MagicMock:
     return model
 
 
+def _make_runtime(**kwargs: Any) -> PolicyRuntime:
+    """Create a PolicyRuntime with _connected=True for testing."""
+    runtime = PolicyRuntime(**kwargs)
+    runtime._connected = True  # noqa: SLF001
+    return runtime
+
+
 def _exhaustible_side_effect(
     initial_chunks: list[np.ndarray],
     action_dim: int = 2,
@@ -69,7 +76,7 @@ def test_full_loop_with_duration(self) -> None:
         execution = SyncExecution()
         queue = ActionQueue()
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,
@@ -94,7 +101,7 @@ def test_hold_fallback_when_queue_empty(self) -> None:
         execution = SyncExecution()
         queue = ActionQueue()
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,
@@ -123,7 +130,7 @@ def test_worker_died_error_propagation(self) -> None:
         queue = ActionQueue()
         queue.push_chunk(np.random.randn(4, 3).astype(np.float32))
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,
@@ -141,7 +148,7 @@ def test_shutdown_does_not_disconnect(self) -> None:
         model = _make_mock_model()
         execution = SyncExecution()
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,
@@ -155,6 +162,21 @@ def test_shutdown_does_not_disconnect(self) -> None:
 
         robot.disconnect.assert_not_called()
 
+    def test_run_raises_if_not_connected(self) -> None:
+        robot = _make_mock_robot()
+        model = _make_mock_model()
+        execution = SyncExecution()
+
+        runtime = PolicyRuntime(
+            robot=robot,
+            model=model,
+            execution=execution,
+            fps=10.0,
+        )
+
+        with pytest.raises(RuntimeError, match="connect"):
+            runtime.run(duration_s=1.0)
+
 
 class TestRuntimeCallback:
     def test_before_send_action_called(self) -> None:
@@ -164,7 +186,7 @@ def test_before_send_action_called(self) -> None:
         callback = MagicMock()
         callback.before_send_action.return_value = None
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,
@@ -186,7 +208,7 @@ def test_callback_raises_does_not_crash_loop(self) -> None:
         bad_callback = MagicMock()
         bad_callback.before_send_action.side_effect = RuntimeError("oops")
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,
@@ -212,7 +234,7 @@ def test_on_hold_called_when_queue_empty(self) -> None:
         callback.before_send_action.return_value = None
         callback.on_hold.return_value = None
 
-        runtime = PolicyRuntime(
+        runtime = _make_runtime(
             robot=robot,
             model=model,
             execution=execution,

From 93696bbbea7d05309a2d5d737d545b1bc4c269ba Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 23:09:13 +0200
Subject: [PATCH 11/19] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/physicalai/runtime/runtime.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index 42c1df6..910bf2d 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -192,12 +192,14 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
                     holds = self._action_queue.consecutive_holds
                     if holds == 1:
                         logger.warning("Queue empty — holding position")
-                    elif self._fps > 0 and holds % int(self._fps) == 0:
-                        logger.warning(
-                            "Queue starvation: %d consecutive holds (%.1fs)",
-                            holds,
-                            holds / self._fps,
-                        )
+                    elif self._fps > 0:
+                        warning_interval = max(int(self._fps), 1)
+                        if holds % warning_interval == 0:
+                            logger.warning(
+                                "Queue starvation: %d consecutive holds (%.1fs)",
+                                holds,
+                                holds / self._fps,
+                            )
                     self._invoke_callback("on_hold", step=step, holds=holds)
 
                 if action is None:

From 90effbf6bf33a2f01ba81ce27d36b06cd88c7b82 Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 23:09:46 +0200
Subject: [PATCH 12/19] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/physicalai/runtime/runtime.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index 910bf2d..2458701 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -204,6 +204,10 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
 
                 if action is None:
                     logger.error("No action available (warmup may have failed)")
+                    elapsed = time.perf_counter() - loop_start
+                    sleep_time = goal_time - elapsed
+                    if sleep_time > 0:
+                        time.sleep(sleep_time)
                     step += 1
                     continue
 

From ac6c8e3d366b96c94b2b6064458692035bafd9bf Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 23:16:13 +0200
Subject: [PATCH 13/19] address copilot

---
 src/physicalai/runtime/runtime.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index 2458701..3e3ac7f 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -280,7 +280,11 @@ def _invoke_callback(self, method: str, **kwargs: Any) -> Any:  # noqa: ANN401
             fn = getattr(cb, method, None)
             if fn is not None:
                 try:
-                    result = fn(**kwargs)
+                    callback_result = fn(**kwargs)
+                    if callback_result is not None:
+                        result = callback_result
+                        if method == "before_send_action":
+                            kwargs["action"] = callback_result
                 except Exception:
                     logger.exception("Callback %s.%s raised", type(cb).__name__, method)
         return result

From 211fba489681d47910f828a09a6b3827b6477040 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 23:19:19 +0200
Subject: [PATCH 14/19] fix ruff

---
 src/physicalai/runtime/runtime.py | 50 ++++++++++++++++---------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index 3e3ac7f..e8f9a80 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -20,7 +20,6 @@
     from collections.abc import Mapping, Sequence
 
     from physicalai.capture.camera import Camera
-    from physicalai.capture.frame import Frame
     from physicalai.inference.model import InferenceModel
     from physicalai.robot.interface import Robot
 
@@ -143,11 +142,11 @@ def disconnect(self) -> None:
 
         self._connected = False
 
-    def __enter__(self) -> Self:
+    def __enter__(self) -> Self:  # noqa: D105
         self.connect()
         return self
 
-    def __exit__(self, *exc_info: object) -> None:
+    def __exit__(self, *exc_info: object) -> None:  # noqa: D105
         self.disconnect()
 
     def run(self, *, duration_s: float | None = None) -> RunStats:
@@ -189,25 +188,11 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
                     last_action = action
                 else:
                     action = last_action
-                    holds = self._action_queue.consecutive_holds
-                    if holds == 1:
-                        logger.warning("Queue empty — holding position")
-                    elif self._fps > 0:
-                        warning_interval = max(int(self._fps), 1)
-                        if holds % warning_interval == 0:
-                            logger.warning(
-                                "Queue starvation: %d consecutive holds (%.1fs)",
-                                holds,
-                                holds / self._fps,
-                            )
-                    self._invoke_callback("on_hold", step=step, holds=holds)
+                    self._handle_hold(step=step)
 
                 if action is None:
                     logger.error("No action available (warmup may have failed)")
-                    elapsed = time.perf_counter() - loop_start
-                    sleep_time = goal_time - elapsed
-                    if sleep_time > 0:
-                        time.sleep(sleep_time)
+                    self._tick_sleep(loop_start, goal_time)
                     step += 1
                     continue
 
@@ -217,11 +202,7 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
 
                 self._robot.send_action(action, goal_time=self._goal_time)
                 self._invoke_callback("on_action_sent", action=action, step=step)
-
-                elapsed = time.perf_counter() - loop_start
-                sleep_time = goal_time - elapsed
-                if sleep_time > 0:
-                    time.sleep(sleep_time)
+                self._tick_sleep(loop_start, goal_time)
 
                 step += 1
 
@@ -240,6 +221,27 @@ def run(self, *, duration_s: float | None = None) -> RunStats:
             inference_count=getattr(self._execution, "inference_count", 0),
         )
 
+    def _handle_hold(self, *, step: int) -> None:
+        holds = self._action_queue.consecutive_holds
+        if holds == 1:
+            logger.warning("Queue empty — holding position")
+        elif self._fps > 0:
+            warning_interval = max(int(self._fps), 1)
+            if holds % warning_interval == 0:
+                logger.warning(
+                    "Queue starvation: %d consecutive holds (%.1fs)",
+                    holds,
+                    holds / self._fps,
+                )
+        self._invoke_callback("on_hold", step=step, holds=holds)
+
+    @staticmethod
+    def _tick_sleep(loop_start: float, goal_time: float) -> None:
+        elapsed = time.perf_counter() - loop_start
+        sleep_time = goal_time - elapsed
+        if sleep_time > 0:
+            time.sleep(sleep_time)
+
     def _build_model_input(self) -> dict[str, Any]:
         robot_obs = self._robot.get_observation()
         model_input: dict[str, Any] = {}

From 3d05530ee93e77d2a92aeeafc02af4db4274e24f Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Tue, 19 May 2026 23:22:28 +0200
Subject: [PATCH 15/19] fix all linting issues

---
 examples/runtime/async_inference.py | 5 +++--
 src/physicalai/robot/__init__.py    | 7 +++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/runtime/async_inference.py b/examples/runtime/async_inference.py
index 006496c..6b07efa 100644
--- a/examples/runtime/async_inference.py
+++ b/examples/runtime/async_inference.py
@@ -83,8 +83,9 @@ def main():
     except Exception as e:
         print(f"Failed to connect: {e}")
         print("Available cameras:")
-        for c in discover_all():
-            print(f"  Driver: {c.driver}, Device: {c.device}, Info: {c.info}")
+        for driver, devices in discover_all().items():
+            for dev in devices:
+                print(f"  Driver: {driver}, Device: {dev.device_id}, Info: {dev.name}")
         return
 
     for name, cam in cameras.items():
diff --git a/src/physicalai/robot/__init__.py b/src/physicalai/robot/__init__.py
index d2159e9..2c55631 100644
--- a/src/physicalai/robot/__init__.py
+++ b/src/physicalai/robot/__init__.py
@@ -13,10 +13,17 @@
 
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from physicalai.robot.connect import connect
 from physicalai.robot.interface import Robot, RobotObservation
 from physicalai.robot.verify import verify_robot
 
+if TYPE_CHECKING:
+    from physicalai.robot.so101 import SO101 as SO101
+    from physicalai.robot.trossen import BimanualWidowXAI as BimanualWidowXAI
+    from physicalai.robot.trossen import WidowXAI as WidowXAI
+
 __all__ = [
     "Robot",
     "RobotObservation",

From 51a892773aa710d53dc1b5bacee80015c122f656 Mon Sep 17 00:00:00 2001
From: Max Xiang <xiangxiang.ma@intel.com>
Date: Wed, 20 May 2026 15:55:38 +0200
Subject: [PATCH 16/19] update copyright header

---
 src/physicalai/inference/__init__.py            | 2 +-
 src/physicalai/inference/adapters/__init__.py   | 2 +-
 src/physicalai/inference/adapters/_discovery.py | 2 +-
 src/physicalai/inference/adapters/openvino.py   | 2 +-
 src/physicalai/inference/model.py               | 2 +-
 src/physicalai/runtime/__init__.py              | 2 +-
 src/physicalai/runtime/_action_queue.py         | 2 +-
 src/physicalai/runtime/execution.py             | 2 +-
 src/physicalai/runtime/runtime.py               | 2 +-
 src/physicalai/runtime/smoothers.py             | 2 +-
 tests/unit/inference/test_adapters.py           | 2 +-
 tests/unit/inference/test_model.py              | 2 +-
 tests/unit/runtime/test_runtime.py              | 2 +-
 13 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/physicalai/inference/__init__.py b/src/physicalai/inference/__init__.py
index aacf56f..186b175 100644
--- a/src/physicalai/inference/__init__.py
+++ b/src/physicalai/inference/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Production inference module for exported policies.
diff --git a/src/physicalai/inference/adapters/__init__.py b/src/physicalai/inference/adapters/__init__.py
index f3124c7..debc1d8 100644
--- a/src/physicalai/inference/adapters/__init__.py
+++ b/src/physicalai/inference/adapters/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Inference adapters for different backend runtimes.
diff --git a/src/physicalai/inference/adapters/_discovery.py b/src/physicalai/inference/adapters/_discovery.py
index ed00a77..154c203 100644
--- a/src/physicalai/inference/adapters/_discovery.py
+++ b/src/physicalai/inference/adapters/_discovery.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Adapter discovery and factory helpers."""
diff --git a/src/physicalai/inference/adapters/openvino.py b/src/physicalai/inference/adapters/openvino.py
index 08a3ff4..81b5d2e 100644
--- a/src/physicalai/inference/adapters/openvino.py
+++ b/src/physicalai/inference/adapters/openvino.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """OpenVINO adapter for inference."""
diff --git a/src/physicalai/inference/model.py b/src/physicalai/inference/model.py
index 8d943a9..20948c1 100644
--- a/src/physicalai/inference/model.py
+++ b/src/physicalai/inference/model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Production-ready inference model with unified API."""
diff --git a/src/physicalai/runtime/__init__.py b/src/physicalai/runtime/__init__.py
index bbc41d0..4a4ec70 100644
--- a/src/physicalai/runtime/__init__.py
+++ b/src/physicalai/runtime/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Runtime system for running trained policies on robot hardware.
diff --git a/src/physicalai/runtime/_action_queue.py b/src/physicalai/runtime/_action_queue.py
index af768a6..c195edf 100644
--- a/src/physicalai/runtime/_action_queue.py
+++ b/src/physicalai/runtime/_action_queue.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
diff --git a/src/physicalai/runtime/execution.py b/src/physicalai/runtime/execution.py
index 28ec86c..bb2c854 100644
--- a/src/physicalai/runtime/execution.py
+++ b/src/physicalai/runtime/execution.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Execution strategies for scheduling policy inference."""
diff --git a/src/physicalai/runtime/runtime.py b/src/physicalai/runtime/runtime.py
index e8f9a80..801daa9 100644
--- a/src/physicalai/runtime/runtime.py
+++ b/src/physicalai/runtime/runtime.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """PolicyRuntime — runs a trained policy on robot hardware."""
diff --git a/src/physicalai/runtime/smoothers.py b/src/physicalai/runtime/smoothers.py
index 70d0bd9..58b1d14 100644
--- a/src/physicalai/runtime/smoothers.py
+++ b/src/physicalai/runtime/smoothers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Chunk smoothers for runtime action queues."""
diff --git a/tests/unit/inference/test_adapters.py b/tests/unit/inference/test_adapters.py
index f2be690..42efaba 100644
--- a/tests/unit/inference/test_adapters.py
+++ b/tests/unit/inference/test_adapters.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Unit tests for inference adapters shipped with the core ``physicalai`` package.
diff --git a/tests/unit/inference/test_model.py b/tests/unit/inference/test_model.py
index 77e801b..b52c1de 100644
--- a/tests/unit/inference/test_model.py
+++ b/tests/unit/inference/test_model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Unit tests for InferenceModel and inference runners."""
diff --git a/tests/unit/runtime/test_runtime.py b/tests/unit/runtime/test_runtime.py
index 839adb2..45b7bf8 100644
--- a/tests/unit/runtime/test_runtime.py
+++ b/tests/unit/runtime/test_runtime.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2025-2026 Intel Corporation
+# Copyright (C) 2026 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Tests for physicalai.runtime.runtime."""

From 3c8b0a34ce73a1aefb106cede392c4590cd12f7a Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Thu, 21 May 2026 13:49:44 +0200
Subject: [PATCH 17/19] address PR comments

---
 src/physicalai/inference/model.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/physicalai/inference/model.py b/src/physicalai/inference/model.py
index 20948c1..aeb4c7e 100644
--- a/src/physicalai/inference/model.py
+++ b/src/physicalai/inference/model.py
@@ -229,10 +229,17 @@ def predict_action_chunk(self, observation: dict[str, np.ndarray]) -> np.ndarray
             2-D action chunk with shape ``(chunk_size, action_dim)``.
         """
         outputs = self(observation)
-        actions = np.asarray(outputs[ACTION])
-        # Drop the adapter's batch dimension so the return value matches
-        # the documented contract: a 2-D (chunk, action_dim) sequence.
-        return np.atleast_2d(np.squeeze(actions))
+        actions = outputs[ACTION]
+        # Strip the batch dimension; reject actual batches (batch > 1).
+        if actions.ndim == 3:
+            if actions.shape[0] != 1:
+                msg = (
+                    f"Batched inference is not supported by predict_action_chunk: "
+                    f"expected batch dimension of 1, got shape {actions.shape}"
+                )
+                raise ValueError(msg)
+            actions = actions[0]
+        return np.atleast_2d(actions)
 
     def reset(self) -> None:
         """Reset policy state for new episode.

From f2df02824865718ce1c2a07ebd3abc8f267e8698 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Thu, 21 May 2026 13:57:51 +0200
Subject: [PATCH 18/19] fix style

---
 src/physicalai/inference/model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/physicalai/inference/model.py b/src/physicalai/inference/model.py
index aeb4c7e..365f7b2 100644
--- a/src/physicalai/inference/model.py
+++ b/src/physicalai/inference/model.py
@@ -227,11 +227,14 @@ def predict_action_chunk(self, observation: dict[str, np.ndarray]) -> np.ndarray
 
         Returns:
             2-D action chunk with shape ``(chunk_size, action_dim)``.
+
+        Raises:
+            ValueError: If the output has a batch dimension greater than 1.
         """
         outputs = self(observation)
         actions = outputs[ACTION]
         # Strip the batch dimension; reject actual batches (batch > 1).
-        if actions.ndim == 3:
+        if actions.ndim == 3:  # noqa: N806
             if actions.shape[0] != 1:
                 msg = (
                     f"Batched inference is not supported by predict_action_chunk: "

From fb12417ca372640e354439fb13c37178e6a8c425 Mon Sep 17 00:00:00 2001
From: Max X <xiangxiang.ma@intel.com>
Date: Thu, 21 May 2026 13:58:55 +0200
Subject: [PATCH 19/19] fix style

---
 src/physicalai/inference/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/physicalai/inference/model.py b/src/physicalai/inference/model.py
index 365f7b2..4964969 100644
--- a/src/physicalai/inference/model.py
+++ b/src/physicalai/inference/model.py
@@ -234,7 +234,7 @@ def predict_action_chunk(self, observation: dict[str, np.ndarray]) -> np.ndarray
         outputs = self(observation)
         actions = outputs[ACTION]
         # Strip the batch dimension; reject actual batches (batch > 1).
-        if actions.ndim == 3:  # noqa: N806
+        if actions.ndim == 3:  # noqa: PLR2004
             if actions.shape[0] != 1:
                 msg = (
                     f"Batched inference is not supported by predict_action_chunk: "