From 2dc12e24090967cba995288dcd8030a5bb632ca4 Mon Sep 17 00:00:00 2001
From: dotsimulate <dotmediathink@gmail.com>
Date: Sun, 29 Mar 2026 16:33:00 -0400
Subject: [PATCH 01/11] Fix SDXL TensorRT engine build failure on Windows

- Fix external data detection in optimize_onnx to check .data/.onnx.data extensions (not just .pb)
- Handle torch.onnx.export creating external sidecar files with non-.pb names for >2GB SDXL models
- Normalize all external data to weights.pb for consistent downstream handling
- Add ByteSize check before single-file ONNX save to prevent silent >2GB serialization failure
- Add pre-build verification: check .opt.onnx exists and is non-empty before TRT engine build
- Tolerate Windows file-lock failures during post-build ONNX cleanup instead of crashing
- Add diagnostic logging for file sizes throughout export/optimize/build pipeline
---
 .../acceleration/tensorrt/builder.py          |  24 +++-
 .../acceleration/tensorrt/utilities.py        | 134 +++++++++++++++---
 2 files changed, 136 insertions(+), 22 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/builder.py b/src/streamdiffusion/acceleration/tensorrt/builder.py
index 8ca5ce01..dbc5f34f 100644
--- a/src/streamdiffusion/acceleration/tensorrt/builder.py
+++ b/src/streamdiffusion/acceleration/tensorrt/builder.py
@@ -128,6 +128,23 @@ def build(
         self.model.min_latent_shape = min_image_resolution // 8
         self.model.max_latent_shape = max_image_resolution // 8
 
+        # --- Verify ONNX artifacts exist before TRT build ---
+        if not os.path.exists(onnx_opt_path):
+            raise RuntimeError(
+                f"Optimized ONNX file missing: {onnx_opt_path}\n"
+                f"This usually means the ONNX optimization step failed silently.\n"
+                f"Try deleting the engine directory and rebuilding."
+            )
+        opt_file_size = os.path.getsize(onnx_opt_path)
+        if opt_file_size == 0:
+            os.remove(onnx_opt_path)
+            raise RuntimeError(
+                f"Optimized ONNX file is empty (0 bytes): {onnx_opt_path}\n"
+                f"This usually indicates a protobuf serialization failure for >2GB models.\n"
+                f"Try deleting the engine directory and rebuilding."
+            )
+        _build_logger.info(f"Verified ONNX opt file: {onnx_opt_path} ({opt_file_size / (1024**2):.1f} MB)")
+
         # --- TRT Engine Build ---
         if not force_engine_build and os.path.exists(engine_path):
             print(f"Found cached engine: {engine_path}")
@@ -150,11 +167,14 @@ def build(
             stats["stages"]["trt_build"] = {"status": "built", "elapsed_s": round(elapsed, 2)}
             _build_logger.warning(f"[BUILD] TRT engine build ({engine_filename}): {elapsed:.1f}s")
 
-        # Cleanup ONNX artifacts
+        # Cleanup ONNX artifacts — tolerate Windows file-lock failures (Issue #4)
         for file in os.listdir(os.path.dirname(engine_path)):
             if file.endswith('.engine'):
                 continue
-            os.remove(os.path.join(os.path.dirname(engine_path), file))
+            try:
+                os.remove(os.path.join(os.path.dirname(engine_path), file))
+            except OSError as cleanup_err:
+                _build_logger.warning(f"[BUILD] Could not delete temp file {file}: {cleanup_err}")
 
         # Record totals
         total_elapsed = time.perf_counter() - build_total_start
diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index 375133de..075f8cf2 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -539,6 +539,24 @@ def build_engine(
 
 
 
+def _find_external_data_files(directory: str) -> list:
+    """Find external data files in a directory by checking common extensions.
+
+    torch.onnx.export may create .data files, while our pipeline uses .pb.
+    This detects both to ensure proper handling of >2GB ONNX models.
+    """
+    import os
+    external_exts = ('.pb', '.data', '.onnx.data', '.onnx_data')
+    result = []
+    try:
+        for f in os.listdir(directory):
+            if any(f.endswith(ext) for ext in external_exts):
+                result.append(f)
+    except OSError:
+        pass
+    return result
+
+
 def export_onnx(
     model,
     onnx_path: str,
@@ -628,14 +646,31 @@ def export_onnx(
         if is_large_model:
             import os
 
-            # Check file size on disk (avoids protobuf 2GB serialization limit in ByteSize())
-            if os.path.getsize(onnx_path) > 2147483648:  # 2GB
-                # Load the exported model for re-saving
-                onnx_model = onnx.load(onnx_path)
-                # Create directory for external data
-                onnx_dir = os.path.dirname(onnx_path)
-                
-                # Re-save with external data format
+            onnx_dir = os.path.dirname(onnx_path)
+            onnx_file_size = os.path.getsize(onnx_path)
+
+            # Check if torch.onnx.export already created external data files
+            # PyTorch may auto-save with .data extension for >2GB models
+            existing_external = _find_external_data_files(onnx_dir)
+
+            if onnx_file_size > 2147483648:  # >2GB single file
+                logger.info(f"ONNX file is {onnx_file_size / (1024**3):.2f} GB, converting to external data format...")
+                # Load the >2GB single-file model. The protobuf upb backend (used in protobuf 4.25.x)
+                # can handle >2GB messages. If this fails, it means protobuf can't parse it.
+                try:
+                    onnx_model = onnx.load(onnx_path)
+                except Exception as load_err:
+                    raise RuntimeError(
+                        f"Failed to load >2GB ONNX model ({onnx_file_size / (1024**3):.2f} GB): {load_err}\n"
+                        f"This may be a protobuf version issue. Ensure protobuf==4.25.3 is installed.\n"
+                        f"Run: pip install protobuf==4.25.3"
+                    ) from load_err
+                # Clean up any existing external data files before saving
+                for ef in existing_external:
+                    try:
+                        os.remove(os.path.join(onnx_dir, ef))
+                    except OSError:
+                        pass
                 onnx.save_model(
                     onnx_model,
                     onnx_path,
@@ -646,6 +681,29 @@ def export_onnx(
                 )
                 logger.info(f"Converted to external data format with weights in weights.pb")
                 del onnx_model
+            elif existing_external:
+                # torch.onnx.export already saved with external data (e.g., .data files)
+                # Normalize to weights.pb for consistent downstream detection
+                logger.info(f"Found existing external data files from torch export: {existing_external}")
+                onnx_model = onnx.load(onnx_path, load_external_data=True)
+                # Clean up old external data files
+                for ef in existing_external:
+                    try:
+                        os.remove(os.path.join(onnx_dir, ef))
+                    except OSError:
+                        pass
+                onnx.save_model(
+                    onnx_model,
+                    onnx_path,
+                    save_as_external_data=True,
+                    all_tensors_to_one_file=True,
+                    location="weights.pb",
+                    convert_attribute=False,
+                )
+                logger.info(f"Normalized external data to weights.pb")
+                del onnx_model
+            else:
+                logger.info(f"ONNX file is {onnx_file_size / (1024**3):.2f} GB (under 2GB), no external data conversion needed")
     del wrapped_model
     gc.collect()
     torch.cuda.empty_cache()
@@ -658,27 +716,35 @@ def optimize_onnx(
 ):
     import os
     import shutil
-    
-    # Check if external data files exist (indicating external data format was used)
+
     onnx_dir = os.path.dirname(onnx_path)
-    external_data_files = [f for f in os.listdir(onnx_dir) if f.endswith('.pb')]
+
+    # Detect external data files using comprehensive extension check
+    external_data_files = _find_external_data_files(onnx_dir)
     uses_external_data = len(external_data_files) > 0
-    
+
+    # Also check ONNX file size — if the main file is small but no external data detected,
+    # something may be wrong (torch may have saved external data with an unexpected name)
+    onnx_file_size = os.path.getsize(onnx_path)
+
     if uses_external_data:
+        logger.info(f"Optimizing ONNX with external data (found: {external_data_files})")
         # Load model with external data
         onnx_model = onnx.load(onnx_path, load_external_data=True)
         onnx_opt_graph = model_data.optimize(onnx_model)
-        
+        del onnx_model
+
         # Create output directory
         opt_dir = os.path.dirname(onnx_opt_path)
         os.makedirs(opt_dir, exist_ok=True)
-        
-        # Clean up existing files in output directory
+
+        # Clean up existing ONNX/external data files in output directory
         if os.path.exists(opt_dir):
+            cleanup_exts = ('.pb', '.onnx', '.data', '.onnx.data', '.onnx_data')
             for f in os.listdir(opt_dir):
-                if f.endswith('.pb') or f.endswith('.onnx'):
+                if any(f.endswith(ext) for ext in cleanup_exts):
                     os.remove(os.path.join(opt_dir, f))
-        
+
         # Save optimized model with external data format
         onnx.save_model(
             onnx_opt_graph,
@@ -689,12 +755,40 @@ def optimize_onnx(
             convert_attribute=False,
         )
         logger.info(f"ONNX optimization complete with external data")
-        
+
     else:
         # Standard optimization for smaller models
+        logger.info(f"Optimizing ONNX (single file, {onnx_file_size / (1024**2):.1f} MB)")
         onnx_opt_graph = model_data.optimize(onnx.load(onnx_path))
-        onnx.save(onnx_opt_graph, onnx_opt_path)
-    
+
+        # Check if the optimized graph is too large for single-file serialization
+        try:
+            opt_size = onnx_opt_graph.ByteSize()
+        except Exception:
+            opt_size = 0
+
+        if opt_size > 2000000000:  # ~2GB with margin
+            logger.info(f"Optimized model is {opt_size / (1024**3):.2f} GB, saving with external data")
+            onnx.save_model(
+                onnx_opt_graph,
+                onnx_opt_path,
+                save_as_external_data=True,
+                all_tensors_to_one_file=True,
+                location="weights.pb",
+                convert_attribute=False,
+            )
+        else:
+            onnx.save(onnx_opt_graph, onnx_opt_path)
+
+    # Verify the output file was created
+    if not os.path.exists(onnx_opt_path):
+        raise RuntimeError(f"ONNX optimization failed: output file was not created at {onnx_opt_path}")
+    opt_file_size = os.path.getsize(onnx_opt_path)
+    if opt_file_size == 0:
+        os.remove(onnx_opt_path)
+        raise RuntimeError(f"ONNX optimization failed: output file is empty (0 bytes) at {onnx_opt_path}")
+    logger.info(f"Optimized ONNX saved: {onnx_opt_path} ({opt_file_size / (1024**2):.1f} MB)")
+
     del onnx_opt_graph
     gc.collect()
     torch.cuda.empty_cache()

From 14deecb6dca2349b54e545e41f87fb20bb1cafee Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Mon, 30 Mar 2026 22:23:51 -0400
Subject: [PATCH 02/11] feat: clone decode_image output to prevent TRT VAE
 buffer reuse

Adds .clone() immediately after VAE decode in __call__ (img2img) and txt2img
inference paths. Prevents TRT VAE buffer being silently reused on the next
decode call when prev_image_result is read downstream.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py
index 0781ead6..e9f8685a 100644
--- a/src/streamdiffusion/pipeline.py
+++ b/src/streamdiffusion/pipeline.py
@@ -1005,7 +1005,7 @@ def __call__(
         self.prev_latent_result = x_0_pred_out.clone()
 
         
-        x_output = self.decode_image(x_0_pred_out)
+        x_output = self.decode_image(x_0_pred_out).clone()
 
         # IMAGE POSTPROCESSING HOOKS: After VAE decoding, before final output
         x_output = self._apply_image_postprocessing_hooks(x_output)
@@ -1098,7 +1098,7 @@ def txt2img(self, batch_size: int = 1) -> torch.Tensor:
         self.prev_latent_result = x_0_pred_out.clone()
 
         
-        x_output = self.decode_image(x_0_pred_out)
+        x_output = self.decode_image(x_0_pred_out).clone()
 
         # IMAGE POSTPROCESSING HOOKS: After VAE decoding, before final output
         x_output = self._apply_image_postprocessing_hooks(x_output)

From 857533266f7685d71b9689962caba2ce52024b6e Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Mon, 30 Mar 2026 23:22:07 -0400
Subject: [PATCH 03/11] perf: Tier 2 inference performance optimizations

- utilities.py: clean allocate_buffers, simplified ONNX external data
  handling with ByteSize() check, simplified optimize_onnx with .pb
  extension detection
- postprocessing_orchestrator.py: preserve HEAD docstring for
  _should_use_sync_processing (correctly describes temporal coherence
  and feedback loop behavior)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../tensorrt/runtime_engines/unet_engine.py   | 251 +++++++++--------
 .../acceleration/tensorrt/utilities.py        | 264 ++++++------------
 .../postprocessing_orchestrator.py            | 197 +++++++------
 3 files changed, 317 insertions(+), 395 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py b/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py
index 5713eaa0..caa14bb8 100644
--- a/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py
+++ b/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py
@@ -1,49 +1,58 @@
-import os
-import torch
 import logging
+import os
 from typing import *
 
-from polygraphy import cuda
+import torch
 import torch.nn.functional as F
 import torchvision.transforms as T
-from torchvision.transforms import InterpolationMode
 from diffusers.models.autoencoders.autoencoder_kl import DecoderOutput
-from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
 from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTinyOutput
+from polygraphy import cuda
+from torchvision.transforms import InterpolationMode
 
 from ..utilities import Engine
 
+
 # Set up logger for this module
 logger = logging.getLogger(__name__)
 
 
 class UNet2DConditionModelEngine:
-    def __init__(self, filepath: str, stream: 'cuda.Stream', use_cuda_graph: bool = False):
+    def __init__(self, filepath: str, stream: "cuda.Stream", use_cuda_graph: bool = False):
         self.engine = Engine(filepath)
         self.stream = stream
         self.use_cuda_graph = use_cuda_graph
         self.use_control = False  # Will be set to True by wrapper if engine has ControlNet support
         self._cached_dummy_controlnet_inputs = None
-        
+
         # Enable VRAM monitoring only if explicitly requested (defaults to False for performance)
-        self.debug_vram = os.getenv('STREAMDIFFUSION_DEBUG_VRAM', '').lower() in ('1', 'true')
+        self.debug_vram = os.getenv("STREAMDIFFUSION_DEBUG_VRAM", "").lower() in ("1", "true")
 
         self.engine.load()
         self.engine.activate()
-        
+
         # Cache expensive attribute lookups to avoid repeated getattr calls
         self._use_ipadapter_cached = None
-        
+
         # Pre-compute ControlNet input names to avoid string formatting in hot paths
         # Support up to 20 ControlNet inputs (more than enough for typical use cases)
         self._input_control_names = [f"input_control_{i:02d}" for i in range(20)]
         self._output_control_names = [f"output_control_{i:02d}" for i in range(20)]
         self._input_control_middle = "input_control_middle"
 
+        # Pre-allocated template dicts for __call__ hot path — updated in-place each call
+        # to avoid per-call dict creation and f-string formatting overhead.
+        # KVO key names are populated lazily on first call (model-dependent length).
+        self._kvo_in_names: List[str] = []
+        self._kvo_out_names: List[str] = []
+        self._kvo_cache_len: int = -1  # -1 = not yet initialized
+        self._shape_dict: Dict[str, Any] = {}
+        self._input_dict: Dict[str, Any] = {}
+
     def _check_use_ipadapter(self) -> bool:
         """Cache IP-Adapter detection to avoid repeated getattr calls"""
         if self._use_ipadapter_cached is None:
-            self._use_ipadapter_cached = getattr(self, 'use_ipadapter', False)
+            self._use_ipadapter_cached = getattr(self, "use_ipadapter", False)
         return self._use_ipadapter_cached
 
     def __call__(
@@ -57,46 +66,48 @@ def __call__(
         controlnet_conditioning: Optional[Dict[str, List[torch.Tensor]]] = None,
         **kwargs,
     ) -> Any:
-        
-
         if timestep.dtype != torch.float32:
             timestep = timestep.float()
 
-        kvo_cache_in_shape_dict = {f"kvo_cache_in_{i}": _kvo_cache.shape for i, _kvo_cache in enumerate(kvo_cache)}
-        kvo_cache_out_shape_dict = {f"kvo_cache_out_{i}": (*_kvo_cache.shape[:1], 1, *_kvo_cache.shape[2:]) for i, _kvo_cache in enumerate(kvo_cache)}
-        kvo_cache_in_dict = {f"kvo_cache_in_{i}": _kvo_cache for i, _kvo_cache in enumerate(kvo_cache)}
-
-        # Prepare base shape and input dictionaries
-        shape_dict = {
-            "sample": latent_model_input.shape,
-            "timestep": timestep.shape,
-            "encoder_hidden_states": encoder_hidden_states.shape,
-            "latent": latent_model_input.shape,
-            **kvo_cache_in_shape_dict,
-            **kvo_cache_out_shape_dict,
-        }
-        
-        input_dict = {
-            "sample": latent_model_input,
-            "timestep": timestep,
-            "encoder_hidden_states": encoder_hidden_states,
-            **kvo_cache_in_dict,
-        }
-
-        
+        # Lazy-init KVO key name lists when cache length becomes known (model-dependent).
+        # After first call the length is stable, so this branch runs exactly once.
+        n_kvo = len(kvo_cache)
+        if n_kvo != self._kvo_cache_len:
+            self._kvo_in_names = [f"kvo_cache_in_{i}" for i in range(n_kvo)]
+            self._kvo_out_names = [f"kvo_cache_out_{i}" for i in range(n_kvo)]
+            self._kvo_cache_len = n_kvo
+
+        # Update pre-allocated dicts in-place — no new dict objects created per call.
+        shape_dict = self._shape_dict
+        input_dict = self._input_dict
+
+        shape_dict["sample"] = latent_model_input.shape
+        shape_dict["timestep"] = timestep.shape
+        shape_dict["encoder_hidden_states"] = encoder_hidden_states.shape
+        shape_dict["latent"] = latent_model_input.shape
+
+        input_dict["sample"] = latent_model_input
+        input_dict["timestep"] = timestep
+        input_dict["encoder_hidden_states"] = encoder_hidden_states
+
+        for i, _kvo in enumerate(kvo_cache):
+            in_name = self._kvo_in_names[i]
+            out_name = self._kvo_out_names[i]
+            shape_dict[in_name] = _kvo.shape
+            shape_dict[out_name] = (*_kvo.shape[:1], 1, *_kvo.shape[2:])
+            input_dict[in_name] = _kvo
+
         # Handle IP-Adapter runtime scale vector if engine was built with it
         if self._check_use_ipadapter():
-            if 'ipadapter_scale' not in kwargs:
+            if "ipadapter_scale" not in kwargs:
                 logger.error("UNet2DConditionModelEngine: ipadapter_scale missing but required (use_ipadapter=True)")
                 raise RuntimeError("UNet2DConditionModelEngine: ipadapter_scale is required for IP-Adapter engines")
-            ip_scale = kwargs['ipadapter_scale']
+            ip_scale = kwargs["ipadapter_scale"]
             if not isinstance(ip_scale, torch.Tensor):
                 logger.error(f"UNet2DConditionModelEngine: ipadapter_scale has wrong type: {type(ip_scale)}")
                 raise TypeError("ipadapter_scale must be a torch.Tensor")
             shape_dict["ipadapter_scale"] = ip_scale.shape
             input_dict["ipadapter_scale"] = ip_scale
-            
-
 
         # Handle ControlNet inputs if provided
         if controlnet_conditioning is not None:
@@ -105,40 +116,42 @@ def __call__(
         elif down_block_additional_residuals is not None or mid_block_additional_residual is not None:
             # Option 2: Diffusers-style ControlNet residuals
             self._add_controlnet_residuals(
-                down_block_additional_residuals, 
-                mid_block_additional_residual, 
-                shape_dict, 
-                input_dict
+                down_block_additional_residuals, mid_block_additional_residual, shape_dict, input_dict
             )
         else:
             # Check if this engine was compiled with ControlNet support but no conditioning is provided
             if self.use_control:
-                unet_arch = getattr(self, 'unet_arch', {})
+                unet_arch = getattr(self, "unet_arch", {})
                 if unet_arch:
                     current_latent_height = latent_model_input.shape[2]
                     current_latent_width = latent_model_input.shape[3]
-                    
+
                     # Check if cached dummy inputs exist and have correct dimensions
-                    if (self._cached_dummy_controlnet_inputs is None or 
-                        not hasattr(self, '_cached_latent_dims') or
-                        self._cached_latent_dims != (current_latent_height, current_latent_width)):
-                        
+                    if (
+                        self._cached_dummy_controlnet_inputs is None
+                        or not hasattr(self, "_cached_latent_dims")
+                        or self._cached_latent_dims != (current_latent_height, current_latent_width)
+                    ):
                         try:
-                            self._cached_dummy_controlnet_inputs = self._generate_dummy_controlnet_specs(latent_model_input)
+                            self._cached_dummy_controlnet_inputs = self._generate_dummy_controlnet_specs(
+                                latent_model_input
+                            )
                             self._cached_latent_dims = (current_latent_height, current_latent_width)
                         except RuntimeError:
                             self._cached_dummy_controlnet_inputs = None
-                    
+
                     if self._cached_dummy_controlnet_inputs is not None:
-                        self._add_cached_dummy_inputs(self._cached_dummy_controlnet_inputs, latent_model_input, shape_dict, input_dict)
+                        self._add_cached_dummy_inputs(
+                            self._cached_dummy_controlnet_inputs, latent_model_input, shape_dict, input_dict
+                        )
 
         # Allocate buffers and run inference
         if self.debug_vram:
             allocated_before = torch.cuda.memory_allocated() / 1024**3
             logger.debug(f"VRAM before allocation: {allocated_before:.2f}GB")
-        
+
         self.engine.allocate_buffers(shape_dict=shape_dict, device=latent_model_input.device)
-        
+
         if self.debug_vram:
             allocated_after = torch.cuda.memory_allocated() / 1024**3
             logger.debug(f"VRAM after allocation: {allocated_after:.2f}GB")
@@ -152,69 +165,67 @@ def __call__(
         except Exception as e:
             logger.exception(f"UNet2DConditionModelEngine.__call__: Engine.infer failed: {e}")
             raise
-        
-        
+
         if self.debug_vram:
             allocated_final = torch.cuda.memory_allocated() / 1024**3
             logger.debug(f"VRAM after inference: {allocated_final:.2f}GB")
-        
-       
-        
+
         noise_pred = outputs["latent"]
-        if len(kvo_cache) > 0:
-            kvo_cache_out = [outputs[f"kvo_cache_out_{i}"] for i in range(len(kvo_cache))]
+        if n_kvo > 0:
+            kvo_cache_out = [outputs[name] for name in self._kvo_out_names]
         else:
             kvo_cache_out = []
         return noise_pred, kvo_cache_out
 
-    def _add_controlnet_conditioning_dict(self, 
-                                        controlnet_conditioning: Dict[str, List[torch.Tensor]], 
-                                        shape_dict: Dict, 
-                                        input_dict: Dict):
+    def _add_controlnet_conditioning_dict(
+        self, controlnet_conditioning: Dict[str, List[torch.Tensor]], shape_dict: Dict, input_dict: Dict
+    ):
         """
         Add ControlNet conditioning from organized dictionary
-        
+
         Args:
             controlnet_conditioning: Dict with 'input', 'output', 'middle' keys
             shape_dict: Shape dictionary to update
             input_dict: Input dictionary to update
         """
         # Add input controls (down blocks)
-        if 'input' in controlnet_conditioning:
-            for i, tensor in enumerate(controlnet_conditioning['input']):
+        if "input" in controlnet_conditioning:
+            for i, tensor in enumerate(controlnet_conditioning["input"]):
                 input_name = self._input_control_names[i]  # Use pre-computed names
                 shape_dict[input_name] = tensor.shape
                 input_dict[input_name] = tensor
-        
-        # Add output controls (up blocks) 
-        if 'output' in controlnet_conditioning:
-            for i, tensor in enumerate(controlnet_conditioning['output']):
+
+        # Add output controls (up blocks)
+        if "output" in controlnet_conditioning:
+            for i, tensor in enumerate(controlnet_conditioning["output"]):
                 input_name = self._output_control_names[i]  # Use pre-computed names
                 shape_dict[input_name] = tensor.shape
                 input_dict[input_name] = tensor
-        
+
         # Add middle controls
-        if 'middle' in controlnet_conditioning:
-            for i, tensor in enumerate(controlnet_conditioning['middle']):
+        if "middle" in controlnet_conditioning:
+            for i, tensor in enumerate(controlnet_conditioning["middle"]):
                 input_name = self._input_control_middle  # Use pre-computed name
                 shape_dict[input_name] = tensor.shape
                 input_dict[input_name] = tensor
 
-    def _add_controlnet_residuals(self, 
-                                down_block_additional_residuals: Optional[List[torch.Tensor]], 
-                                mid_block_additional_residual: Optional[torch.Tensor],
-                                shape_dict: Dict, 
-                                input_dict: Dict):
+    def _add_controlnet_residuals(
+        self,
+        down_block_additional_residuals: Optional[List[torch.Tensor]],
+        mid_block_additional_residual: Optional[torch.Tensor],
+        shape_dict: Dict,
+        input_dict: Dict,
+    ):
         """
         Add ControlNet residuals in diffusers format
-        
+
         Args:
             down_block_additional_residuals: List of down block residuals
             mid_block_additional_residual: Middle block residual
             shape_dict: Shape dictionary to update
             input_dict: Input dictionary to update
         """
-        
+
         # Add down block residuals as input controls
         if down_block_additional_residuals is not None:
             # Map directly to engine input names (no reversal needed for our approach)
@@ -222,21 +233,19 @@ def _add_controlnet_residuals(self,
                 input_name = self._input_control_names[i]  # Use pre-computed names
                 shape_dict[input_name] = tensor.shape
                 input_dict[input_name] = tensor
-        
+
         # Add middle block residual
         if mid_block_additional_residual is not None:
             input_name = self._input_control_middle  # Use pre-computed name
             shape_dict[input_name] = mid_block_additional_residual.shape
             input_dict[input_name] = mid_block_additional_residual
 
-    def _add_cached_dummy_inputs(self, 
-                               dummy_inputs: Dict, 
-                               latent_model_input: torch.Tensor,
-                               shape_dict: Dict, 
-                               input_dict: Dict):
+    def _add_cached_dummy_inputs(
+        self, dummy_inputs: Dict, latent_model_input: torch.Tensor, shape_dict: Dict, input_dict: Dict
+    ):
         """
         Add cached dummy inputs to the shape dictionary and input dictionary
-        
+
         Args:
             dummy_inputs: Dictionary containing dummy input specifications
             latent_model_input: The main latent input tensor (used for device/dtype reference)
@@ -245,54 +254,58 @@ def _add_cached_dummy_inputs(self,
         """
         for input_name, shape_spec in dummy_inputs.items():
             channels = shape_spec["channels"]
-            height = shape_spec["height"] 
+            height = shape_spec["height"]
             width = shape_spec["width"]
-            
+
             # Create zero tensor with appropriate shape
             zero_tensor = torch.zeros(
-                latent_model_input.shape[0], channels, height, width,
-                dtype=latent_model_input.dtype, device=latent_model_input.device
+                latent_model_input.shape[0],
+                channels,
+                height,
+                width,
+                dtype=latent_model_input.dtype,
+                device=latent_model_input.device,
             )
-            
+
             shape_dict[input_name] = zero_tensor.shape
             input_dict[input_name] = zero_tensor
 
     def _generate_dummy_controlnet_specs(self, latent_model_input: torch.Tensor) -> Dict:
         """
         Generate dummy ControlNet input specifications once and cache them.
-        
+
         Args:
             latent_model_input: The main latent input tensor (used for dimensions)
-            
+
         Returns:
             Dictionary containing dummy input specifications
         """
         # Get latent dimensions
         latent_height = latent_model_input.shape[2]
         latent_width = latent_model_input.shape[3]
-        
+
         # Calculate image dimensions (assuming 8x upsampling from latent)
         image_height = latent_height * 8
         image_width = latent_width * 8
-        
+
         # Get stored architecture info from engine (set during building)
-        unet_arch = getattr(self, 'unet_arch', {})
-        
+        unet_arch = getattr(self, "unet_arch", {})
+
         if not unet_arch:
             raise RuntimeError("No ControlNet architecture info available on engine. Cannot generate dummy inputs.")
-        
+
         # Use the same logic as UNet.get_control() to generate control input specs
         from ..models.models import UNet
-        
+
         # Create a temporary UNet model instance just to use its get_control method
         temp_unet = UNet(
             use_control=True,
             unet_arch=unet_arch,
             image_height=image_height,
             image_width=image_width,
-            min_batch_size=1  # Minimal params needed for get_control
+            min_batch_size=1,  # Minimal params needed for get_control
         )
-        
+
         return temp_unet.get_control(image_height, image_width)
 
     def to(self, *args, **kwargs):
@@ -307,7 +320,7 @@ def __init__(
         self,
         encoder_path: str,
         decoder_path: str,
-        stream: 'cuda.Stream',
+        stream: "cuda.Stream",
         scaling_factor: int,
         use_cuda_graph: bool = False,
     ):
@@ -370,7 +383,7 @@ def forward(self, *args, **kwargs):
 
 
 class SafetyCheckerEngine:
-    def __init__(self, filepath: str, stream: 'cuda.Stream', use_cuda_graph: bool = False):
+    def __init__(self, filepath: str, stream: "cuda.Stream", use_cuda_graph: bool = False):
         self.engine = Engine(filepath)
         self.stream = stream
         self.use_cuda_graph = use_cuda_graph
@@ -398,23 +411,26 @@ def to(self, *args, **kwargs):
     def forward(self, *args, **kwargs):
         pass
 
+
 class NSFWDetectorEngine:
-    def __init__(self, filepath: str, stream: 'cuda.Stream', use_cuda_graph: bool = False):
+    def __init__(self, filepath: str, stream: "cuda.Stream", use_cuda_graph: bool = False):
         self.engine = Engine(filepath)
         self.stream = stream
         self.use_cuda_graph = use_cuda_graph
 
         # The resize shape, mean and std are fetched from the model/processor config
-        self.image_transforms = T.Compose([
-            T.Resize(size=(448, 448), interpolation=InterpolationMode.BICUBIC, max_size=None, antialias=True),
-            T.CenterCrop(size=(448, 448)),
-            T.Lambda(lambda x: x.clamp(0, 1)),
-            T.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.2758])
-        ])
+        self.image_transforms = T.Compose(
+            [
+                T.Resize(size=(448, 448), interpolation=InterpolationMode.BICUBIC, max_size=None, antialias=True),
+                T.CenterCrop(size=(448, 448)),
+                T.Lambda(lambda x: x.clamp(0, 1)),
+                T.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.2758]),
+            ]
+        )
 
         self.engine.load()
         self.engine.activate()
-        
+
     def __call__(self, image_tensor: torch.Tensor, threshold: float):
         pixel_values = self.image_transforms(image_tensor)
         self.engine.allocate_buffers(
@@ -426,17 +442,16 @@ def __call__(self, image_tensor: torch.Tensor, threshold: float):
         )
         logits = self.engine.infer(
             {"pixel_values": pixel_values},
-            self.stream,    
+            self.stream,
             use_cuda_graph=self.use_cuda_graph,
         )["logits"]
 
         probs = F.softmax(logits, dim=-1)
         nsfw_prob = 1 - probs[0, 0].item()
         return nsfw_prob >= threshold
-        
 
-    def to(self, *args, **kwargs):  
+    def to(self, *args, **kwargs):
         pass
 
     def forward(self, *args, **kwargs):
-        pass
\ No newline at end of file
+        pass
diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index 075f8cf2..c69309d0 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -19,9 +19,11 @@
 #
 
 import gc
+
+# Set up logger for this module
+import logging
 from collections import OrderedDict
-from pathlib import Path
-from typing import Any, List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import numpy as np
 import onnx
@@ -40,18 +42,17 @@
     network_from_onnx_path,
     save_engine,
 )
-from polygraphy.backend.trt import util as trt_util
 
 from .models.models import CLIP, VAE, BaseModel, UNet, VAEEncoder
 
-# Set up logger for this module
-import logging
+
 logger = logging.getLogger(__name__)
 
 TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
 
 from ...model_detection import detect_model
 
+
 # Map of numpy dtype -> torch dtype
 numpy_to_torch_dtype_dict = {
     np.uint8: torch.uint8,
@@ -96,7 +97,7 @@ def __init__(
         self.buffers = OrderedDict()
         self.tensors = OrderedDict()
         self.cuda_graph_instance = None  # cuda graph
-        
+
         # Buffer reuse optimization tracking
         self._last_shape_dict = None
         self._last_device = None
@@ -105,21 +106,21 @@ def __init__(
 
     def __del__(self):
         # Check if AttributeError: 'Engine' object has no attribute 'buffers'
-        if not hasattr(self, 'buffers'):
+        if not hasattr(self, "buffers"):
             return
         [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
-        
-        if hasattr(self, 'cuda_graph_instance') and self.cuda_graph_instance is not None:
+
+        if hasattr(self, "cuda_graph_instance") and self.cuda_graph_instance is not None:
             try:
                 CUASSERT(cudart.cudaGraphExecDestroy(self.cuda_graph_instance))
             except:
                 pass
-        if hasattr(self, 'graph') and self.graph is not None:
+        if hasattr(self, "graph") and self.graph is not None:
             try:
                 CUASSERT(cudart.cudaGraphDestroy(self.graph))
             except:
                 pass
-        
+
         del self.engine
         del self.context
         del self.buffers
@@ -257,7 +258,12 @@ def build(
         engine = engine_from_network(
             network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
             config=CreateConfig(
-                fp16=fp16, tf32=True, refittable=enable_refit, profiles=[p], load_timing_cache=timing_cache, **config_kwargs
+                fp16=fp16,
+                tf32=True,
+                refittable=enable_refit,
+                profiles=[p],
+                load_timing_cache=timing_cache,
+                **config_kwargs,
             ),
             save_timing_cache=timing_cache,
         )
@@ -278,19 +284,19 @@ def allocate_buffers(self, shape_dict=None, device="cuda"):
         # Check if we can reuse existing buffers (OPTIMIZATION)
         if self._can_reuse_buffers(shape_dict, device):
             return
-        
+
         # Clear existing buffers before reallocating
         self.tensors.clear()
-        
+
         # Reset CUDA graph when buffers are reallocated
         # The captured graph becomes invalid with new memory addresses
         if self.cuda_graph_instance is not None:
             CUASSERT(cudart.cudaGraphExecDestroy(self.cuda_graph_instance))
             self.cuda_graph_instance = None
-            if hasattr(self, 'graph') and self.graph is not None:
+            if hasattr(self, "graph") and self.graph is not None:
                 CUASSERT(cudart.cudaGraphDestroy(self.graph))
                 self.graph = None
-        
+
         for idx in range(self.engine.num_io_tensors):
             name = self.engine.get_tensor_name(idx)
 
@@ -305,65 +311,60 @@ def allocate_buffers(self, shape_dict=None, device="cuda"):
             if mode == trt.TensorIOMode.INPUT:
                 self.context.set_input_shape(name, shape)
 
-            if any(s < 0 for s in shape):
-                import logging as _logging
-                _logging.getLogger(__name__).error(f"allocate_buffers: tensor '{name}' has negative shape {shape} — not in shape_dict. shape_dict keys: {list(shape_dict.keys()) if shape_dict else []}")
-            tensor = torch.empty(tuple(shape),
-                                 dtype=numpy_to_torch_dtype_dict[dtype_np]) \
-                          .to(device=device)
+            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype_np]).to(device=device)
             self.tensors[name] = tensor
-        
+
         # Cache allocation parameters for reuse check
         self._last_shape_dict = shape_dict.copy() if shape_dict else None
         self._last_device = device
-    
+
     def _can_reuse_buffers(self, shape_dict=None, device="cuda"):
         """
         Check if existing buffers can be reused (avoiding expensive reallocation)
-        
+
         Returns:
             bool: True if buffers can be reused, False if reallocation needed
         """
         # No existing tensors - need to allocate
         if not self.tensors:
             return False
-        
+
         # Device changed - need to reallocate
-        if not hasattr(self, '_last_device') or self._last_device != device:
+        if not hasattr(self, "_last_device") or self._last_device != device:
             return False
-        
+
         # No cached shape_dict - need to allocate
-        if not hasattr(self, '_last_shape_dict'):
+        if not hasattr(self, "_last_shape_dict"):
             return False
-            
+
         # Compare current vs cached shape_dict
         if shape_dict is None and self._last_shape_dict is None:
             return True
         elif shape_dict is None or self._last_shape_dict is None:
             return False
-        
+
         # Quick check: if tensor counts differ, can't reuse
         if len(shape_dict) != len(self._last_shape_dict):
             return False
-        
+
         # Compare shapes for all tensors in the new shape_dict
         for name, new_shape in shape_dict.items():
             # Check if tensor exists in cached shapes
             cached_shape = self._last_shape_dict.get(name)
             if cached_shape is None:
                 return False
-            
+
             # Compare shapes (handle different types consistently)
             if tuple(cached_shape) != tuple(new_shape):
                 return False
-        
+
         return True
 
     def reset_cuda_graph(self):
         if self.cuda_graph_instance is not None:
             CUASSERT(cudart.cudaGraphExecDestroy(self.cuda_graph_instance))
             self.cuda_graph_instance = None
-        if hasattr(self, 'graph') and self.graph is not None:
+        if hasattr(self, "graph") and self.graph is not None:
             CUASSERT(cudart.cudaGraphDestroy(self.graph))
             self.graph = None
 
@@ -388,10 +389,11 @@ def infer(self, feed_dict, stream, use_cuda_graph=False):
                 if missing:
                     logger.debug(
                         "TensorRT Engine: filtering unsupported inputs %s (allowed=%s)",
-                        missing, sorted(list(self._allowed_inputs))
+                        missing,
+                        sorted(list(self._allowed_inputs)),
                     )
             feed_dict = filtered_feed_dict
-        
+
         for name, buf in feed_dict.items():
             self.tensors[name].copy_(buf)
 
@@ -401,7 +403,9 @@ def infer(self, feed_dict, stream, use_cuda_graph=False):
         if use_cuda_graph:
             if self.cuda_graph_instance is not None:
                 CUASSERT(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream.ptr))
-                CUASSERT(cudart.cudaStreamSynchronize(stream.ptr))
+                # No cudaStreamSynchronize — graph replay is async; stream ordering ensures
+                # downstream GPU ops (copy_, attention) wait for graph completion.
+                # CPU sync happens only via end.synchronize() in pipeline.__call__.
             else:
                 # do inference before CUDA graph capture
                 noerror = self.context.execute_async_v3(stream.ptr)
@@ -515,7 +519,7 @@ def build_engine(
         max_workspace_size = min(free_mem - activation_carveout, 8 * GiB)
     else:
         max_workspace_size = 0
-    logger.info(f"TRT workspace: free_mem={free_mem/GiB:.1f}GiB, max_workspace={max_workspace_size/GiB:.1f}GiB")
+    logger.info(f"TRT workspace: free_mem={free_mem / GiB:.1f}GiB, max_workspace={max_workspace_size / GiB:.1f}GiB")
     engine = Engine(engine_path)
     input_profile = model_data.get_input_profile(
         opt_batch_size,
@@ -536,27 +540,6 @@ def build_engine(
     return engine
 
 
-
-
-
-def _find_external_data_files(directory: str) -> list:
-    """Find external data files in a directory by checking common extensions.
-
-    torch.onnx.export may create .data files, while our pipeline uses .pb.
-    This detects both to ensure proper handling of >2GB ONNX models.
-    """
-    import os
-    external_exts = ('.pb', '.data', '.onnx.data', '.onnx_data')
-    result = []
-    try:
-        for f in os.listdir(directory):
-            if any(f.endswith(ext) for ext in external_exts):
-                result.append(f)
-    except OSError:
-        pass
-    return result
-
-
 def export_onnx(
     model,
     onnx_path: str,
@@ -567,67 +550,66 @@ def export_onnx(
     onnx_opset: int,
 ):
     # TODO: Not 100% happy about this function - needs refactoring
-    
+
     is_sdxl = False
     is_sdxl_controlnet = False
 
     # Detect if this is a ControlNet model (vs UNet model)
-    is_controlnet = (
-        hasattr(model, '__class__') and 'ControlNet' in model.__class__.__name__
-    ) or (
-        hasattr(model, 'config') and hasattr(model.config, '_class_name') and
-        'ControlNet' in model.config._class_name
+    is_controlnet = (hasattr(model, "__class__") and "ControlNet" in model.__class__.__name__) or (
+        hasattr(model, "config") and hasattr(model.config, "_class_name") and "ControlNet" in model.config._class_name
     )
 
     # Detect if this is an SDXL model via detect_model
-    if hasattr(model, 'unet'):
+    if hasattr(model, "unet"):
         detection_result = detect_model(model.unet)
         if detection_result is not None:
-            is_sdxl = detection_result.get('is_sdxl', False)
-    elif hasattr(model, 'config'):
+            is_sdxl = detection_result.get("is_sdxl", False)
+    elif hasattr(model, "config"):
         detection_result = detect_model(model)
         if detection_result is not None:
-            is_sdxl = detection_result.get('is_sdxl', False)
-    
+            is_sdxl = detection_result.get("is_sdxl", False)
+
     # Detect if this is an SDXL ControlNet
-    is_sdxl_controlnet = is_controlnet and (is_sdxl or (
-        hasattr(model, 'config') and
-        getattr(model.config, 'addition_embed_type', None) == 'text_time'
-    ))
-    
+    is_sdxl_controlnet = is_controlnet and (
+        is_sdxl or (hasattr(model, "config") and getattr(model.config, "addition_embed_type", None) == "text_time")
+    )
+
     wrapped_model = model  # Default: use model as-is
-    
+
     # Apply SDXL wrapper for SDXL models (in practice, always UnifiedExportWrapper)
     # Skip SDXLExportWrapper if model is already a UnifiedExportWrapper — it handles
     # SDXL conditioning internally and has strict positional arg requirements (e.g.
     # ipadapter_scale) that SDXLExportWrapper's forward-test probe would violate.
     from .export_wrappers.unet_unified_export import UnifiedExportWrapper
+
     if is_sdxl and not is_controlnet and not isinstance(model, UnifiedExportWrapper):
-        embedding_dim = getattr(model_data, 'embedding_dim', 'unknown')
+        embedding_dim = getattr(model_data, "embedding_dim", "unknown")
         logger.info(f"Detected SDXL model (embedding_dim={embedding_dim}), using wrapper for ONNX export...")
         from .export_wrappers.unet_sdxl_export import SDXLExportWrapper
+
         wrapped_model = SDXLExportWrapper(model)
     elif not is_controlnet:
-        embedding_dim = getattr(model_data, 'embedding_dim', 'unknown')
+        embedding_dim = getattr(model_data, "embedding_dim", "unknown")
         logger.info(f"Detected non-SDXL model (embedding_dim={embedding_dim}), using model as-is for ONNX export...")
-    
+
     # SDXL ControlNet models need special wrapper for added_cond_kwargs
     elif is_sdxl_controlnet:
         logger.info("Detected SDXL ControlNet model, using specialized wrapper...")
         from .export_wrappers.controlnet_export import SDXLControlNetExportWrapper
+
         wrapped_model = SDXLControlNetExportWrapper(model)
-    
+
     # Regular ControlNet models are exported directly
     elif is_controlnet:
         logger.info("Detected ControlNet model, exporting directly...")
         wrapped_model = model
-    
+
     with torch.inference_mode(), torch.autocast("cuda"):
         inputs = model_data.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
-        
+
         # Determine if we need external data format for large models (like SDXL)
-        is_large_model = is_sdxl or (hasattr(model, 'config') and getattr(model.config, 'sample_size', 32) >= 64)
-        
+        is_large_model = is_sdxl or (hasattr(model, "config") and getattr(model.config, "sample_size", 32) >= 64)
+
         # Export ONNX normally first
         torch.onnx.export(
             wrapped_model,
@@ -641,57 +623,20 @@ def export_onnx(
             dynamic_axes=model_data.get_dynamic_axes(),
             dynamo=False,
         )
-        
+
         # Convert to external data format for large models (SDXL)
         if is_large_model:
             import os
 
-            onnx_dir = os.path.dirname(onnx_path)
-            onnx_file_size = os.path.getsize(onnx_path)
-
-            # Check if torch.onnx.export already created external data files
-            # PyTorch may auto-save with .data extension for >2GB models
-            existing_external = _find_external_data_files(onnx_dir)
-
-            if onnx_file_size > 2147483648:  # >2GB single file
-                logger.info(f"ONNX file is {onnx_file_size / (1024**3):.2f} GB, converting to external data format...")
-                # Load the >2GB single-file model. The protobuf upb backend (used in protobuf 4.25.x)
-                # can handle >2GB messages. If this fails, it means protobuf can't parse it.
-                try:
-                    onnx_model = onnx.load(onnx_path)
-                except Exception as load_err:
-                    raise RuntimeError(
-                        f"Failed to load >2GB ONNX model ({onnx_file_size / (1024**3):.2f} GB): {load_err}\n"
-                        f"This may be a protobuf version issue. Ensure protobuf==4.25.3 is installed.\n"
-                        f"Run: pip install protobuf==4.25.3"
-                    ) from load_err
-                # Clean up any existing external data files before saving
-                for ef in existing_external:
-                    try:
-                        os.remove(os.path.join(onnx_dir, ef))
-                    except OSError:
-                        pass
-                onnx.save_model(
-                    onnx_model,
-                    onnx_path,
-                    save_as_external_data=True,
-                    all_tensors_to_one_file=True,
-                    location="weights.pb",
-                    convert_attribute=False,
-                )
-                logger.info(f"Converted to external data format with weights in weights.pb")
-                del onnx_model
-            elif existing_external:
-                # torch.onnx.export already saved with external data (e.g., .data files)
-                # Normalize to weights.pb for consistent downstream detection
-                logger.info(f"Found existing external data files from torch export: {existing_external}")
-                onnx_model = onnx.load(onnx_path, load_external_data=True)
-                # Clean up old external data files
-                for ef in existing_external:
-                    try:
-                        os.remove(os.path.join(onnx_dir, ef))
-                    except OSError:
-                        pass
+            # Load the exported model
+            onnx_model = onnx.load(onnx_path)
+
+            # Check if model is large enough to need external data
+            if onnx_model.ByteSize() > 2147483648:  # 2GB
+                # Create directory for external data
+                onnx_dir = os.path.dirname(onnx_path)
+
+                # Re-save with external data format
                 onnx.save_model(
                     onnx_model,
                     onnx_path,
@@ -700,10 +645,9 @@ def export_onnx(
                     location="weights.pb",
                     convert_attribute=False,
                 )
-                logger.info(f"Normalized external data to weights.pb")
-                del onnx_model
-            else:
-                logger.info(f"ONNX file is {onnx_file_size / (1024**3):.2f} GB (under 2GB), no external data conversion needed")
+                logger.info("Converted to external data format with weights in weights.pb")
+
+            del onnx_model
     del wrapped_model
     gc.collect()
     torch.cuda.empty_cache()
@@ -715,34 +659,26 @@ def optimize_onnx(
     model_data: BaseModel,
 ):
     import os
-    import shutil
 
+    # Check if external data files exist (indicating external data format was used)
     onnx_dir = os.path.dirname(onnx_path)
-
-    # Detect external data files using comprehensive extension check
-    external_data_files = _find_external_data_files(onnx_dir)
+    external_data_files = [f for f in os.listdir(onnx_dir) if f.endswith(".pb")]
     uses_external_data = len(external_data_files) > 0
 
-    # Also check ONNX file size — if the main file is small but no external data detected,
-    # something may be wrong (torch may have saved external data with an unexpected name)
-    onnx_file_size = os.path.getsize(onnx_path)
-
     if uses_external_data:
         logger.info(f"Optimizing ONNX with external data (found: {external_data_files})")
         # Load model with external data
         onnx_model = onnx.load(onnx_path, load_external_data=True)
         onnx_opt_graph = model_data.optimize(onnx_model)
-        del onnx_model
 
         # Create output directory
         opt_dir = os.path.dirname(onnx_opt_path)
         os.makedirs(opt_dir, exist_ok=True)
 
-        # Clean up existing ONNX/external data files in output directory
+        # Clean up existing files in output directory
         if os.path.exists(opt_dir):
-            cleanup_exts = ('.pb', '.onnx', '.data', '.onnx.data', '.onnx_data')
             for f in os.listdir(opt_dir):
-                if any(f.endswith(ext) for ext in cleanup_exts):
+                if f.endswith(".pb") or f.endswith(".onnx"):
                     os.remove(os.path.join(opt_dir, f))
 
         # Save optimized model with external data format
@@ -754,40 +690,12 @@ def optimize_onnx(
             location="weights.pb",
             convert_attribute=False,
         )
-        logger.info(f"ONNX optimization complete with external data")
+        logger.info("ONNX optimization complete with external data")
 
     else:
         # Standard optimization for smaller models
-        logger.info(f"Optimizing ONNX (single file, {onnx_file_size / (1024**2):.1f} MB)")
         onnx_opt_graph = model_data.optimize(onnx.load(onnx_path))
-
-        # Check if the optimized graph is too large for single-file serialization
-        try:
-            opt_size = onnx_opt_graph.ByteSize()
-        except Exception:
-            opt_size = 0
-
-        if opt_size > 2000000000:  # ~2GB with margin
-            logger.info(f"Optimized model is {opt_size / (1024**3):.2f} GB, saving with external data")
-            onnx.save_model(
-                onnx_opt_graph,
-                onnx_opt_path,
-                save_as_external_data=True,
-                all_tensors_to_one_file=True,
-                location="weights.pb",
-                convert_attribute=False,
-            )
-        else:
-            onnx.save(onnx_opt_graph, onnx_opt_path)
-
-    # Verify the output file was created
-    if not os.path.exists(onnx_opt_path):
-        raise RuntimeError(f"ONNX optimization failed: output file was not created at {onnx_opt_path}")
-    opt_file_size = os.path.getsize(onnx_opt_path)
-    if opt_file_size == 0:
-        os.remove(onnx_opt_path)
-        raise RuntimeError(f"ONNX optimization failed: output file is empty (0 bytes) at {onnx_opt_path}")
-    logger.info(f"Optimized ONNX saved: {onnx_opt_path} ({opt_file_size / (1024**2):.1f} MB)")
+        onnx.save(onnx_opt_graph, onnx_opt_path)
 
     del onnx_opt_graph
     gc.collect()
diff --git a/src/streamdiffusion/preprocessing/postprocessing_orchestrator.py b/src/streamdiffusion/preprocessing/postprocessing_orchestrator.py
index 340d5b88..742a80e6 100644
--- a/src/streamdiffusion/preprocessing/postprocessing_orchestrator.py
+++ b/src/streamdiffusion/preprocessing/postprocessing_orchestrator.py
@@ -1,52 +1,60 @@
-import torch
-from typing import List, Optional, Union, Dict, Any
 import logging
+from typing import Any, Dict, List, Optional
+
+import torch
+
 from .base_orchestrator import BaseOrchestrator
 
+
 logger = logging.getLogger(__name__)
 
 
 class PostprocessingOrchestrator(BaseOrchestrator[torch.Tensor, torch.Tensor]):
     """
     Orchestrates postprocessing with parallelization and pipelining.
-    
+
     Handles super-resolution, enhancement, style transfer, and other postprocessing operations
     that are applied to generated images after diffusion.
     """
-    
-    def __init__(self, device: str = "cuda", dtype: torch.dtype = torch.float16, max_workers: int = 4, pipeline_ref: Optional[Any] = None):
+
+    def __init__(
+        self,
+        device: str = "cuda",
+        dtype: torch.dtype = torch.float16,
+        max_workers: int = 4,
+        pipeline_ref: Optional[Any] = None,
+    ):
         # Postprocessing: 50ms timeout for quality-critical operations like upscaling
         super().__init__(device, dtype, max_workers, timeout_ms=20.0, pipeline_ref=pipeline_ref)
-        
+
         # Postprocessing-specific state
-        self._last_input_tensor = None
+        # Cache uses data_ptr + shape for O(1) identity check instead of torch.equal (O(N))
+        self._last_input_ptr = None
+        self._last_input_shape = None
         self._last_processed_result = None
         self._current_input_tensor = None  # For BaseOrchestrator fallback logic
-        
 
-    
-    def process_pipelined(self, 
-                        input_tensor: torch.Tensor,
-                        postprocessors: List[Any],
-                        *args, **kwargs) -> torch.Tensor:
+    def process_pipelined(
+        self, input_tensor: torch.Tensor, postprocessors: List[Any], *args, **kwargs
+    ) -> torch.Tensor:
         """
         Process input with intelligent pipelining.
-        
+
         Overrides base method to store current input tensor for fallback logic.
         """
         # Store current input for fallback logic
         self._current_input_tensor = input_tensor
-        
+
         # RACE CONDITION FIX: Check if there are actually enabled processors
         # Filter to only enabled processors (same logic as _get_ordered_processors)
-        enabled_processors = [p for p in postprocessors if getattr(p, 'enabled', True)] if postprocessors else []
-        
+        enabled_processors = [p for p in postprocessors if getattr(p, "enabled", True)] if postprocessors else []
+
         if not enabled_processors:
             return input_tensor
-        
+
         # Call parent implementation
         return super().process_pipelined(input_tensor, postprocessors, *args, **kwargs)
-    
+
     def _should_use_sync_processing(self, *args, **kwargs) -> bool:
         """
         Determine if synchronous processing should be used instead of pipelined.
@@ -68,25 +76,22 @@ def _should_use_sync_processing(self, *args, **kwargs) -> bool:
             if proc is not None and getattr(proc, 'requires_sync_processing', False):
                 return True
         return False
-    
-    def process_sync(self, 
-                   input_tensor: torch.Tensor,
-                   postprocessors: List[Any],
-                   *args, **kwargs) -> torch.Tensor:
+
+    def process_sync(self, input_tensor: torch.Tensor, postprocessors: List[Any], *args, **kwargs) -> torch.Tensor:
         """
         Process tensor through postprocessors synchronously.
-        
+
         Args:
             input_tensor: Input tensor to postprocess (typically from diffusion output)
             postprocessors: List of postprocessor instances
             *args, **kwargs: Additional arguments for postprocessors
-            
+
         Returns:
             Postprocessed tensor
         """
         if not postprocessors:
             return input_tensor
-        
+
         # Use same stream context as background processing for consistency
         original_stream = self._set_background_stream_context()
         try:
@@ -95,92 +100,83 @@ def process_sync(self,
             for postprocessor in postprocessors:
                 if postprocessor is not None:
                     current_tensor = self._apply_single_postprocessor(current_tensor, postprocessor)
-            
+
             return current_tensor
         finally:
             self._restore_stream_context(original_stream)
-    
-    def _process_frame_background(self, 
-                                input_tensor: torch.Tensor,
-                                postprocessors: List[Any],
-                                *args, **kwargs) -> Dict[str, Any]:
+
+    def _process_frame_background(
+        self, input_tensor: torch.Tensor, postprocessors: List[Any], *args, **kwargs
+    ) -> Dict[str, Any]:
         """
         Process a frame in the background thread.
-        
+
         Implementation of BaseOrchestrator._process_frame_background for postprocessing.
-        
+
         Returns:
             Dictionary containing processing results and status
         """
         try:
             # Set CUDA stream for background processing
             original_stream = self._set_background_stream_context()
-            
+
             if not postprocessors:
-                return {
-                    'result': input_tensor,
-                    'status': 'success'
-                }
-            
-            # Check for cache hit (same input tensor)
-            cache_hit = False
-            if (self._last_input_tensor is not None and self._last_processed_result is not None):
-                if input_tensor.device == self._last_input_tensor.device:
-                    # Same device - direct comparison
-                    cache_hit = torch.equal(input_tensor, self._last_input_tensor)
-                else:
-                    # Different devices - move cached tensor to input device for comparison
-                    cached_on_input_device = self._last_input_tensor.to(device=input_tensor.device, dtype=input_tensor.dtype)
-                    cache_hit = torch.equal(input_tensor, cached_on_input_device)
-            
+                return {"result": input_tensor, "status": "success"}
+
+            # Check for cache hit using data_ptr + shape — O(1) vs torch.equal's O(N).
+            # TRT engines reuse the same output buffer each frame, so data_ptr identity
+            # reliably detects whether the input is the same buffer as last frame.
+            cache_hit = (
+                self._last_input_ptr is not None
+                and self._last_processed_result is not None
+                and input_tensor.data_ptr() == self._last_input_ptr
+                and input_tensor.shape == self._last_input_shape
+            )
+
             if cache_hit:
                 return {
-                    'result': self._last_processed_result,  # Return previously processed result
-                    'status': 'success',
-                    'cache_hit': True
+                    "result": self._last_processed_result,
+                    "status": "success",
+                    "cache_hit": True,
                 }
-            
-            # Update cache with current input tensor
-            self._last_input_tensor = input_tensor.clone()
-            
+
+            # Update cache — store ptr + shape, no tensor clone needed
+            self._last_input_ptr = input_tensor.data_ptr()
+            self._last_input_shape = input_tensor.shape
+
             # Process postprocessors in parallel if multiple, sequential if single
             if len(postprocessors) > 1:
                 result = self._process_postprocessors_parallel(input_tensor, postprocessors)
             else:
                 result = self._apply_single_postprocessor(input_tensor, postprocessors[0])
-            
+
             # Cache the processed result for future cache hits
             self._last_processed_result = result
-            
-            return {
-                'result': result,
-                'status': 'success'
-            }
-            
+
+            return {"result": result, "status": "success"}
+
         except Exception as e:
             logger.error(f"PostprocessingOrchestrator: Background processing failed: {e}")
             return {
-                'result': input_tensor,  # Return original on error
-                'error': str(e),
-                'status': 'error'
+                "result": input_tensor,  # Return original on error
+                "error": str(e),
+                "status": "error",
             }
         finally:
             # Restore original CUDA stream
             self._restore_stream_context(original_stream)
-        
-    def _process_postprocessors_parallel(self, 
-                                       input_tensor: torch.Tensor, 
-                                       postprocessors: List[Any]) -> torch.Tensor:
+
+    def _process_postprocessors_parallel(self, input_tensor: torch.Tensor, postprocessors: List[Any]) -> torch.Tensor:
         """
         Process multiple postprocessors in parallel.
-        
+
         Note: This applies postprocessors sequentially for now, but could be extended
         to support parallel processing for independent postprocessors in the future.
-        
+
         Args:
             input_tensor: Input tensor to process
             postprocessors: List of postprocessor instances
-            
+
         Returns:
             Processed tensor
         """
@@ -190,37 +186,37 @@ def _process_postprocessors_parallel(self,
         for postprocessor in postprocessors:
             if postprocessor is not None:
                 current_tensor = self._apply_single_postprocessor(current_tensor, postprocessor)
-        
+
         return current_tensor
-    
-    def _apply_single_postprocessor(self, 
-                                  input_tensor: torch.Tensor, 
-                                  postprocessor: Any) -> torch.Tensor:
+
+    def _apply_single_postprocessor(self, input_tensor: torch.Tensor, postprocessor: Any) -> torch.Tensor:
         """
         Apply a single postprocessor to the input tensor.
-        
-        Handles normalization conversion between VAE output range [-1,1] and 
+
+        Handles normalization conversion between VAE output range [-1,1] and
         processor input range [0,1], then converts back to VAE range.
-        
+
         Args:
             input_tensor: Input tensor from VAE (range [-1,1])
             postprocessor: Postprocessor instance
-            
+
         Returns:
             Processed tensor in VAE range [-1,1]
         """
         try:
             # Ensure tensor is on correct device and dtype
             processed_tensor = input_tensor.to(device=self.device, dtype=self.dtype)
-            
-            logger.debug(f"_apply_single_postprocessor: Converting tensor from VAE range [-1,1] to processor range [0,1]")
+
+            logger.debug(
+                "_apply_single_postprocessor: Converting tensor from VAE range [-1,1] to processor range [0,1]"
+            )
             processor_input = (processed_tensor / 2.0 + 0.5).clamp(0, 1)
-            
+
             # Apply postprocessor
-            if hasattr(postprocessor, 'process_tensor'):
+            if hasattr(postprocessor, "process_tensor"):
                 # Prefer tensor processing if available
                 processor_output = postprocessor.process_tensor(processor_input)
-            elif hasattr(postprocessor, 'process'):
+            elif hasattr(postprocessor, "process"):
                 # Fallback to general process method
                 processor_output = postprocessor.process(processor_input)
             elif callable(postprocessor):
@@ -229,25 +225,28 @@ def _apply_single_postprocessor(self,
             else:
                 logger.warning(f"PostprocessingOrchestrator: Unknown postprocessor type: {type(postprocessor)}")
                 return processed_tensor
-            
+
             # Ensure result is a tensor
             if isinstance(processor_output, torch.Tensor):
                 # CRITICAL: Convert back from processor output range [0,1] to VAE input range [-1,1]
-                logger.debug(f"_apply_single_postprocessor: Converting result from processor range [0,1] back to VAE range [-1,1]")
+                logger.debug(
+                    "_apply_single_postprocessor: Converting result from processor range [0,1] back to VAE range [-1,1]"
+                )
                 result = (processor_output - 0.5) * 2.0  # Convert [0,1] -> [-1,1]
-                
+
                 return result.to(device=self.device, dtype=self.dtype)
             else:
-                logger.warning(f"PostprocessingOrchestrator: Postprocessor returned non-tensor: {type(processor_output)}")
+                logger.warning(
+                    f"PostprocessingOrchestrator: Postprocessor returned non-tensor: {type(processor_output)}"
+                )
                 return processed_tensor
-                
+
         except Exception as e:
             logger.error(f"PostprocessingOrchestrator: Postprocessor failed: {e}")
             return input_tensor  # Return original on error
-    
+
     def clear_cache(self) -> None:
         """Clear postprocessing cache"""
-        self._last_input_tensor = None
+        self._last_input_ptr = None
+        self._last_input_shape = None
         self._last_processed_result = None
-    
-

From 75086bba05e61da5478f4c9129bcecef2be1b7f8 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Wed, 1 Apr 2026 14:12:09 -0400
Subject: [PATCH 04/11] Add cuda-python 13.x compatibility fix for cudart
 import

In cuda-python 13.x, the 'cudart' module was moved to 'cuda.bindings.runtime'.
Add try/except import that prefers the new location and falls back to the
legacy 'cuda.cudart' path for cuda-python 12.x compatibility.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/acceleration/tensorrt/utilities.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index c69309d0..1f980aaa 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -30,7 +30,12 @@
 import onnx_graphsurgeon as gs
 import tensorrt as trt
 import torch
-from cuda import cudart
+
+# cuda-python 13.x renamed 'cudart' to 'cuda.bindings.runtime'
+try:
+    from cuda.bindings import runtime as cudart
+except ImportError:
+    from cuda import cudart
 from PIL import Image
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path

From 8ac7c6fb93ad29759c39ba6baeaf6b33edf7d78d Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Wed, 1 Apr 2026 14:18:59 -0400
Subject: [PATCH 05/11] Quick-win CUDA optimizations: pre-allocated buffers +
 L2 cache persistence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pre-allocate latent and noise buffers to eliminate per-frame CUDA malloc:
- Replace prev_latent_result = x_0_pred_out.clone() with lazy-allocated
  _latent_cache buffer + copy_() in __call__, txt2img, and txt2img_sd_turbo
- Replace torch.randn_like() in TCD non-batched noise loop with lazy-allocated
  _noise_buf + .normal_() — eliminates per-step allocation on TCD path
- Both buffers allocate on first use (shape is fixed per pipeline instance)

Port cuda_l2_cache.py from CUDA 0.2.99 fork (PLAN_5 Feature 2):
- New file: src/streamdiffusion/tools/cuda_l2_cache.py
- Reserves GPU L2 cache for UNet attention weight tensors (mid_block, up_blocks.1)
- Gated by SDTD_L2_PERSIST=1 env var (default on), requires Ampere+ GPU
- Integrated at end of wrapper._load_model() with silent fallback on failure

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/pipeline.py            |  36 +-
 src/streamdiffusion/tools/cuda_l2_cache.py | 386 +++++++++++++++++++++
 src/streamdiffusion/wrapper.py             |   9 +
 3 files changed, 418 insertions(+), 13 deletions(-)
 create mode 100644 src/streamdiffusion/tools/cuda_l2_cache.py

diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py
index e9f8685a..565c799a 100644
--- a/src/streamdiffusion/pipeline.py
+++ b/src/streamdiffusion/pipeline.py
@@ -102,6 +102,8 @@ def __init__(
         self.similar_filter = SimilarImageFilter()
         self.prev_image_result = None
         self.prev_latent_result = None
+        self._latent_cache = None   # pre-allocated buffer; avoids per-frame CUDA malloc for prev_latent_result
+        self._noise_buf = None      # pre-allocated buffer for per-step noise in TCD non-batched path
 
         self.pipe = pipe
         self.image_processor = VaeImageProcessor(pipe.vae_scale_factor)
@@ -949,12 +951,12 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor:
                     x_0_pred, model_pred = self.unet_step(sample, t, idx)
                     if idx < len(self.sub_timesteps_tensor) - 1:
                         if self.do_add_noise:
-                            sample = self.alpha_prod_t_sqrt[
-                                idx + 1
-                            ] * x_0_pred + self.beta_prod_t_sqrt[
-                                idx + 1
-                            ] * torch.randn_like(
-                                x_0_pred, device=self.device, dtype=self.dtype
+                            if self._noise_buf is None:
+                                self._noise_buf = torch.empty_like(x_0_pred)
+                            self._noise_buf.normal_()
+                            sample = (
+                                self.alpha_prod_t_sqrt[idx + 1] * x_0_pred
+                                + self.beta_prod_t_sqrt[idx + 1] * self._noise_buf
                             )
                         else:
                             sample = self.alpha_prod_t_sqrt[idx + 1] * x_0_pred
@@ -1001,10 +1003,12 @@ def __call__(
         # LATENT POSTPROCESSING HOOKS: After diffusion, before VAE decoding
         x_0_pred_out = self._apply_latent_postprocessing_hooks(x_0_pred_out)
         
-        # Store latent result for latent feedback processors
-        self.prev_latent_result = x_0_pred_out.clone()
+        # Store latent result for latent feedback processors (reuse pre-allocated buffer)
+        if self._latent_cache is None:
+            self._latent_cache = torch.empty_like(x_0_pred_out)
+        self._latent_cache.copy_(x_0_pred_out)
+        self.prev_latent_result = self._latent_cache
 
-        
         x_output = self.decode_image(x_0_pred_out).clone()
 
         # IMAGE POSTPROCESSING HOOKS: After VAE decoding, before final output
@@ -1094,8 +1098,11 @@ def txt2img(self, batch_size: int = 1) -> torch.Tensor:
         # LATENT POSTPROCESSING HOOKS: After diffusion, before VAE decoding
         x_0_pred_out = self._apply_latent_postprocessing_hooks(x_0_pred_out)
         
-        # Store latent result for latent feedback processors
-        self.prev_latent_result = x_0_pred_out.clone()
+        # Store latent result for latent feedback processors (reuse pre-allocated buffer)
+        if self._latent_cache is None:
+            self._latent_cache = torch.empty_like(x_0_pred_out)
+        self._latent_cache.copy_(x_0_pred_out)
+        self.prev_latent_result = self._latent_cache
 
         
         x_output = self.decode_image(x_0_pred_out).clone()
@@ -1152,8 +1159,11 @@ def txt2img_sd_turbo(self, batch_size: int = 1) -> torch.Tensor:
         # LATENT POSTPROCESSING HOOKS: After diffusion, before VAE decoding
         x_0_pred_out = self._apply_latent_postprocessing_hooks(x_0_pred_out)
         
-        # Store latent result for latent feedback processors
-        self.prev_latent_result = x_0_pred_out.clone()
+        # Store latent result for latent feedback processors (reuse pre-allocated buffer)
+        if self._latent_cache is None:
+            self._latent_cache = torch.empty_like(x_0_pred_out)
+        self._latent_cache.copy_(x_0_pred_out)
+        self.prev_latent_result = self._latent_cache
 
         
         x_output = self.decode_image(x_0_pred_out)
diff --git a/src/streamdiffusion/tools/cuda_l2_cache.py b/src/streamdiffusion/tools/cuda_l2_cache.py
new file mode 100644
index 00000000..f0649142
--- /dev/null
+++ b/src/streamdiffusion/tools/cuda_l2_cache.py
@@ -0,0 +1,386 @@
+"""
+L2 Cache Persistence Utility for StreamDiffusion UNet.
+
+Reserves a portion of the GPU's L2 cache for persistent data (UNet weights),
+reducing cache evictions for memory-bandwidth-bound layers.
+
+Requires: CUDA >= 11.2, compute capability >= 8.0 (Ampere+).
+RTX 5090 has 128MB L2, compute 12.0 — full support.
+
+Environment variables:
+    SDTD_L2_PERSIST=1         Enable L2 persistence (default: 1)
+    SDTD_L2_PERSIST_MB=64     MB of L2 to reserve for persistent data (default: 64)
+    SDTD_L2_PERSIST_LAYERS=   Comma-separated layer names for access policy (default: auto)
+
+Expected impact: 5-16% on memory-bandwidth-bound layers (normalization, small GEMMs).
+Hot layers on SDXL: mid_block, up_blocks.1 (most FF hooks + V2V cached attention).
+"""
+
+import ctypes
+import os
+import sys
+from typing import Optional
+
+import torch
+
+
+# =============================================================================
+# Environment Controls
+# =============================================================================
+
+L2_PERSIST_ENABLED = os.environ.get("SDTD_L2_PERSIST", "1") == "1"
+L2_PERSIST_MB = int(os.environ.get("SDTD_L2_PERSIST_MB", "64"))
+
+# Hot layer prefixes — these contain the most attention + FF hook computation.
+# mid_block: 1 transformer block, seq_len=1024, 16 FF hooks
+# up_blocks.1: up-sampling path, seq_len=4096
+_DEFAULT_HOT_LAYER_PREFIXES = ["mid_block", "up_blocks.1"]
+
+
+# =============================================================================
+# CUDA Runtime Access Policy Structs (for Tier 2 per-tensor persistence)
+# =============================================================================
+
+
+class _CudaAccessPolicyWindow(ctypes.Structure):
+    """cudaAccessPolicyWindow struct for cudaStreamSetAttribute."""
+
+    _fields_ = [
+        ("base_ptr", ctypes.c_void_p),  # void* — start of memory region
+        ("num_bytes", ctypes.c_size_t),  # size_t — size of region in bytes
+        ("hitRatio", ctypes.c_float),  # float — fraction in [0, 1] to keep persistent
+        (
+            "hitProp",
+            ctypes.c_int,
+        ),  # cudaAccessProperty: 2 = cudaAccessPropertyPersisting
+        (
+            "missProp",
+            ctypes.c_int,
+        ),  # cudaAccessProperty: 1 = cudaAccessPropertyStreaming
+    ]
+
+
+# cudaAccessProperty enum values
+_CUDA_ACCESS_PROPERTY_NORMAL = 0
+_CUDA_ACCESS_PROPERTY_STREAMING = 1
+_CUDA_ACCESS_PROPERTY_PERSISTING = 2
+
+# cudaStreamAttrID
+_CUDA_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1
+
+# cudaLimit enum (CUDA 11.2+)
+# 0x06 = cudaLimitPersistingL2CacheSize (size in bytes — correct for L2 persistence)
+_CUDA_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06
+
+
+# =============================================================================
+# CUDA Runtime Handle
+# =============================================================================
+
+_cudart: Optional[ctypes.CDLL] = None
+_cudart_loaded: bool = False
+
+
+def _get_cudart() -> Optional[ctypes.CDLL]:
+    """Load the CUDA runtime DLL. Cached after first call."""
+    global _cudart, _cudart_loaded
+    if _cudart_loaded:
+        return _cudart
+
+    _cudart_loaded = True
+
+    if sys.platform != "win32":
+        # Non-Windows: use libcudart.so — typically already loaded by PyTorch
+        try:
+            _cudart = ctypes.CDLL("libcudart.so", mode=ctypes.RTLD_GLOBAL)
+            return _cudart
+        except OSError:
+            pass
+        try:
+            from ctypes.util import find_library
+
+            lib = find_library("cudart")
+            if lib:
+                _cudart = ctypes.CDLL(lib)
+                return _cudart
+        except OSError:
+            pass
+        return None
+
+    # Windows: find cudart64_*.dll shipped with PyTorch or CUDA toolkit
+    import glob
+
+    # Option 1: PyTorch ships cudart in torch/lib/
+    torch_lib = os.path.join(os.path.dirname(torch.__file__), "lib")
+    candidates = sorted(
+        glob.glob(os.path.join(torch_lib, "cudart64_*.dll")), reverse=True
+    )
+
+    # Option 2: CUDA toolkit installation
+    cuda_path = os.environ.get(
+        "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8"
+    )
+    candidates += sorted(
+        glob.glob(os.path.join(cuda_path, "bin", "cudart64_*.dll")), reverse=True
+    )
+
+    for dll_path in candidates:
+        try:
+            _cudart = ctypes.WinDLL(dll_path)
+            return _cudart
+        except OSError:
+            continue
+
+    return None
+
+
+# =============================================================================
+# Tier 1: Reserve L2 Persisting Cache Size
+# =============================================================================
+
+
+def reserve_l2_persisting_cache(persist_mb: int = L2_PERSIST_MB) -> bool:
+    """
+    Reserve a portion of L2 cache for persistent data.
+
+    This is Tier 1 of L2 persistence: informs the driver that `persist_mb` MB
+    of L2 should not be evicted by regular (streaming) accesses. Hot data set
+    via access policy windows will preferentially stay in this reserved region.
+
+    Args:
+        persist_mb: Megabytes of L2 to reserve. Should be <= half of total L2.
+                    RTX 5090 has 128MB L2 → 64MB is a safe default.
+
+    Returns:
+        True if successful, False if unsupported or failed.
+    """
+    if not torch.cuda.is_available():
+        return False
+
+    # Check compute capability — L2 persistence requires Ampere (8.0+)
+    device = torch.cuda.current_device()
+    props = torch.cuda.get_device_properties(device)
+    major, minor = props.major, props.minor
+    if major < 8:
+        print(
+            f"[L2] L2 persistence skipped — compute {major}.{minor} < 8.0 (Ampere required)"
+        )
+        return False
+
+    l2_total_mb = props.L2_cache_size // (1024 * 1024)
+    persist_bytes = min(persist_mb * 1024 * 1024, props.L2_cache_size // 2)
+    persist_mb_actual = persist_bytes // (1024 * 1024)
+
+    cudart = _get_cudart()
+    if cudart is None:
+        print("[L2] CUDA runtime not found — L2 persistence unavailable")
+        return False
+
+    try:
+        result = cudart.cudaDeviceSetLimit(
+            ctypes.c_int(_CUDA_LIMIT_PERSISTING_L2_CACHE_SIZE),
+            ctypes.c_size_t(persist_bytes),
+        )
+        # CRITICAL: Always clear CUDA error state after ctypes calls.
+        # cudaDeviceSetLimit sets the thread-local CUDA error on failure, and
+        # PyTorch's C10_CUDA_KERNEL_LAUNCH_CHECK() reads it on the next kernel
+        # launch — causing a stale error to crash an unrelated operation.
+        cudart.cudaGetLastError()
+        if result != 0:
+            print(f"[L2] cudaDeviceSetLimit failed: error {result}")
+            return False
+
+        print(
+            f"[L2] Reserved {persist_mb_actual}MB of {l2_total_mb}MB L2 for persisting cache "
+            f"(compute {major}.{minor}, {props.name})"
+        )
+        return True
+
+    except (OSError, ctypes.ArgumentError, AttributeError) as e:
+        print(f"[L2] L2 reservation failed: {e}")
+        return False
+
+
+# =============================================================================
+# Tier 2: Per-Tensor Access Policy (stream attribute window)
+# =============================================================================
+
+
+def set_tensor_persisting(tensor: torch.Tensor, hit_ratio: float = 1.0) -> bool:
+    """
+    Mark a tensor's memory region as L2-persistent.
+
+    Uses cudaStreamSetAttribute with cudaAccessPolicyWindow to request that
+    `hit_ratio` fraction of the tensor's data stays in the L2 persisting region.
+
+    Args:
+        tensor: CUDA tensor whose weights should persist in L2.
+        hit_ratio: Fraction [0, 1] of accesses to serve from persisting cache.
+                   1.0 = always try to keep in L2 (good for weights).
+
+    Returns:
+        True if successful.
+    """
+    if not tensor.is_cuda or not tensor.is_contiguous():
+        return False
+
+    cudart = _get_cudart()
+    if cudart is None:
+        return False
+
+    try:
+        stream_ptr = torch.cuda.current_stream().cuda_stream
+
+        window = _CudaAccessPolicyWindow(
+            base_ptr=ctypes.c_void_p(tensor.data_ptr()),
+            num_bytes=ctypes.c_size_t(tensor.nbytes),
+            hitRatio=ctypes.c_float(hit_ratio),
+            hitProp=ctypes.c_int(_CUDA_ACCESS_PROPERTY_PERSISTING),
+            missProp=ctypes.c_int(_CUDA_ACCESS_PROPERTY_STREAMING),
+        )
+
+        result = cudart.cudaStreamSetAttribute(
+            ctypes.c_void_p(stream_ptr),
+            ctypes.c_int(_CUDA_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW),
+            ctypes.byref(window),
+        )
+        cudart.cudaGetLastError()  # Clear any stale CUDA error from ctypes call
+        return result == 0
+
+    except (RuntimeError, OSError, ctypes.ArgumentError, AttributeError):
+        return False
+
+
+def clear_tensor_persisting(tensor: torch.Tensor) -> bool:
+    """
+    Remove L2 persistence policy for a tensor (reset to normal access).
+
+    Call when a tensor is no longer hot (e.g., model unload) to release
+    the L2 persisting budget for other tensors.
+    """
+    if not tensor.is_cuda:
+        return False
+
+    cudart = _get_cudart()
+    if cudart is None:
+        return False
+
+    try:
+        stream_ptr = torch.cuda.current_stream().cuda_stream
+        window = _CudaAccessPolicyWindow(
+            base_ptr=ctypes.c_void_p(tensor.data_ptr()),
+            num_bytes=ctypes.c_size_t(0),
+            hitRatio=ctypes.c_float(0.0),
+            hitProp=ctypes.c_int(_CUDA_ACCESS_PROPERTY_NORMAL),
+            missProp=ctypes.c_int(_CUDA_ACCESS_PROPERTY_STREAMING),
+        )
+        result = cudart.cudaStreamSetAttribute(
+            ctypes.c_void_p(stream_ptr),
+            ctypes.c_int(_CUDA_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW),
+            ctypes.byref(window),
+        )
+        cudart.cudaGetLastError()  # Clear any stale CUDA error from ctypes call
+        return result == 0
+    except (RuntimeError, OSError, ctypes.ArgumentError, AttributeError):
+        return False
+
+
+# =============================================================================
+# High-Level: Pin Hot UNet Layer Weights
+# =============================================================================
+
+
+def pin_hot_unet_weights(
+    unet: torch.nn.Module,
+    hot_prefixes: Optional[list] = None,
+    persist_mb: int = L2_PERSIST_MB,
+) -> int:
+    """
+    Mark hot UNet layer weights as L2-persistent.
+
+    Identifies attention Q/K/V/out projection weights in the hottest layers
+    (mid_block, up_blocks.1) and requests they persist in L2 cache.
+
+    Args:
+        unet: The UNet model (already on CUDA).
+        hot_prefixes: Layer name prefixes to target. Defaults to mid_block + up_blocks.1.
+        persist_mb: MB of L2 to reserve (passed to reserve_l2_persisting_cache).
+
+    Returns:
+        Number of weight tensors successfully pinned.
+    """
+    if not L2_PERSIST_ENABLED:
+        return 0
+
+    if hot_prefixes is None:
+        hot_prefixes = _DEFAULT_HOT_LAYER_PREFIXES
+
+    # Tier 1: Reserve L2 persisting region
+    tier1_ok = reserve_l2_persisting_cache(persist_mb)
+    if not tier1_ok:
+        return 0
+
+    # Tier 2: Set access policy on hot attention weights
+    # Target: to_q, to_k, to_v, to_out weights in hot transformer blocks.
+    # These are small-to-medium GEMMs that benefit most from L2 hits.
+    _hot_weight_keywords = ["to_q", "to_k", "to_v", "to_out"]
+    pinned_count = 0
+    pinned_bytes = 0
+
+    for name, param in unet.named_parameters():
+        if not param.is_cuda:
+            continue
+        is_hot = any(prefix in name for prefix in hot_prefixes)
+        is_attn_weight = any(kw in name for kw in _hot_weight_keywords)
+        if is_hot and is_attn_weight:
+            if set_tensor_persisting(param.data):
+                pinned_count += 1
+                pinned_bytes += param.data.nbytes
+
+    if pinned_count > 0:
+        print(
+            f"[L2] Pinned {pinned_count} attention weight tensors "
+            f"({pinned_bytes / 1024 / 1024:.1f}MB) in L2 persisting cache"
+        )
+    else:
+        print(
+            "[L2] No tensors pinned (params may require_grad=True before compile — call after freeze)"
+        )
+
+    return pinned_count
+
+
+def setup_l2_persistence(unet: torch.nn.Module) -> bool:
+    """
+    Main entry point: set up L2 cache persistence for UNet inference.
+
+    Call this AFTER model is loaded and BEFORE torch.compile.
+    For best results with frozen weights, call AFTER torch.compile with freezing=True.
+
+    Args:
+        unet: The UNet model on CUDA.
+
+    Returns:
+        True if at least Tier 1 (L2 reservation) succeeded.
+    """
+    if not L2_PERSIST_ENABLED:
+        return False
+
+    print(
+        f"\n[L2] Setting up L2 cache persistence "
+        f"(SDTD_L2_PERSIST_MB={L2_PERSIST_MB})..."
+    )
+
+    # Tier 1 is the reliable baseline — always attempt
+    tier1_ok = reserve_l2_persisting_cache(L2_PERSIST_MB)
+
+    if tier1_ok:
+        # Tier 2: per-tensor access policy (best-effort)
+        pinned = pin_hot_unet_weights(unet, persist_mb=0)  # Tier 1 already reserved
+        if pinned == 0:
+            print(
+                "[L2] Tier 2 access policy skipped (call pin_hot_unet_weights() "
+                "after compile+freeze for per-tensor control)"
+            )
+
+    return tier1_ok
diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index c1afc366..cd41efc4 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -2070,6 +2070,15 @@ def _load_model(
             except Exception as e:
                 logger.error(f"Failed to install LatentPostprocessingModule: {e}")
 
+        # L2 cache persistence: pin hot UNet attention weights in GPU L2 cache.
+        # Gated by SDTD_L2_PERSIST=1 (default on). Silent fallback on unsupported GPUs.
+        # Requires Ampere+ (compute 8.0+). Expected gain: 2-5% end-to-end on SD1.5/SD-Turbo.
+        try:
+            from streamdiffusion.tools.cuda_l2_cache import setup_l2_persistence
+            setup_l2_persistence(stream.unet)
+        except Exception as e:
+            logger.debug(f"L2 cache persistence setup skipped: {e}")
+
         return stream
 
     def get_last_processed_image(self, index: int) -> Optional[Image.Image]:

From b4780d0a73f6cece7cfbc59b2a66b62c8292ed11 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Wed, 1 Apr 2026 14:24:00 -0400
Subject: [PATCH 06/11] Fix hardcoded float16 autocasts and add fp32 precision
 for scheduler division

- encode_image / decode_image: replace hardcoded torch.float16 autocast with
  self.dtype so the pipeline correctly honors the torch_dtype constructor param
  (e.g. bfloat16 would still get fp16 VAE without this fix)

- scheduler_step_batch: upcast numerator and alpha_prod_t_sqrt to float32
  before the F_theta division, then cast back to original dtype. When
  alpha_prod_t_sqrt is small (early timesteps), fp16 division can accumulate
  rounding error; fp32 upcast eliminates this at negligible cost (~1-3us/call).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/pipeline.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py
index 565c799a..44ab60a0 100644
--- a/src/streamdiffusion/pipeline.py
+++ b/src/streamdiffusion/pipeline.py
@@ -625,14 +625,18 @@ def scheduler_step_batch(
         idx: Optional[int] = None,
     ) -> torch.Tensor:
         if idx is None:
+            # Upcast division to fp32 — alpha_prod_t_sqrt can be small at early timesteps,
+            # causing fp16 rounding artifacts; cast result back to original dtype.
             F_theta = (
-                x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch
-            ) / self.alpha_prod_t_sqrt
+                (x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch).float()
+                / self.alpha_prod_t_sqrt.float()
+            ).to(x_t_latent_batch.dtype)
             denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch
         else:
             F_theta = (
-                x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch
-            ) / self.alpha_prod_t_sqrt[idx]
+                (x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch).float()
+                / self.alpha_prod_t_sqrt[idx].float()
+            ).to(x_t_latent_batch.dtype)
             denoised_batch = (
                 self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch
             )
@@ -883,7 +887,7 @@ def encode_image(self, image_tensors: torch.Tensor) -> torch.Tensor:
             device=self.device,
             dtype=self.vae.dtype,
         )
-        with torch.autocast("cuda", dtype=torch.float16):
+        with torch.autocast("cuda", dtype=self.dtype):
             img_latent = retrieve_latents(self.vae.encode(image_tensors), self.generator)
         
         img_latent = img_latent * self.vae.config.scaling_factor
@@ -894,7 +898,7 @@ def encode_image(self, image_tensors: torch.Tensor) -> torch.Tensor:
 
     def decode_image(self, x_0_pred_out: torch.Tensor) -> torch.Tensor:
         scaled_latent = x_0_pred_out / self.vae.config.scaling_factor
-        with torch.autocast("cuda", dtype=torch.float16):
+        with torch.autocast("cuda", dtype=self.dtype):
             output_latent = self.vae.decode(scaled_latent, return_dict=False)[0]
         return output_latent
 

From 30761babc0ccab4d7a12e39131f515a57775ef58 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Wed, 1 Apr 2026 14:27:51 -0400
Subject: [PATCH 07/11] Fix L2 cache: second reserve call was resetting
 reservation to 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

setup_l2_persistence() calls pin_hot_unet_weights(persist_mb=0) after
reserving L2. But pin_hot_unet_weights unconditionally called
reserve_l2_persisting_cache(0), which set the persisting L2 size to
0 bytes — undoing the first reservation entirely.

Fix: skip the Tier 1 reserve call in pin_hot_unet_weights when persist_mb=0,
since the caller (setup_l2_persistence) has already handled it.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/tools/cuda_l2_cache.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/streamdiffusion/tools/cuda_l2_cache.py b/src/streamdiffusion/tools/cuda_l2_cache.py
index f0649142..cdafcaa9 100644
--- a/src/streamdiffusion/tools/cuda_l2_cache.py
+++ b/src/streamdiffusion/tools/cuda_l2_cache.py
@@ -315,10 +315,11 @@ def pin_hot_unet_weights(
     if hot_prefixes is None:
         hot_prefixes = _DEFAULT_HOT_LAYER_PREFIXES
 
-    # Tier 1: Reserve L2 persisting region
-    tier1_ok = reserve_l2_persisting_cache(persist_mb)
-    if not tier1_ok:
-        return 0
+    # Tier 1: Reserve L2 persisting region (skip if persist_mb=0, caller already reserved)
+    if persist_mb > 0:
+        tier1_ok = reserve_l2_persisting_cache(persist_mb)
+        if not tier1_ok:
+            return 0
 
     # Tier 2: Set access policy on hot attention weights
     # Target: to_q, to_k, to_v, to_out weights in hot transformer blocks.

From d7cac53f5169a05a57215c71b8b34a194ccf8741 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Wed, 1 Apr 2026 18:56:18 -0400
Subject: [PATCH 08/11] fix: report inference FPS separately from output FPS
 when similar image filter is active

The FPS counter was inflated because skipped frames (cached results from the
similar image filter) returned in ~1ms instead of ~30ms, but were still counted
as processed frames. This caused reported FPS to be ~2x actual GPU inference
rate (e.g., 60 FPS reported while GPU at 50% utilization).

Added `last_frame_was_skipped` flag to pipeline and `inference_fps` tracking
to td_manager. Status line now shows: "FPS: 28.3 (out: 57.1)" separating
real inference rate from output rate. OSC now sends inference FPS as the
primary metric.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 StreamDiffusionTD/td_manager.py | 951 ++++++++++++++++++++++++++++++++
 src/streamdiffusion/pipeline.py |   5 +-
 2 files changed, 955 insertions(+), 1 deletion(-)
 create mode 100644 StreamDiffusionTD/td_manager.py

diff --git a/StreamDiffusionTD/td_manager.py b/StreamDiffusionTD/td_manager.py
new file mode 100644
index 00000000..adaf5d57
--- /dev/null
+++ b/StreamDiffusionTD/td_manager.py
@@ -0,0 +1,951 @@
+"""
+TouchDesigner StreamDiffusion Manager
+
+Core bridge between TouchDesigner and the LivePeer StreamDiffusion fork.
+Handles configuration, streaming loop, and parameter updates.
+"""
+
+import logging
+import os
+import platform
+import sys
+import threading
+import time
+from multiprocessing import shared_memory
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+from PIL import Image
+
+# Logger will be configured by td_main.py based on debug_mode
+logger = logging.getLogger("TouchDesignerManager")
+
+# Add StreamDiffusion to path
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+from streamdiffusion.config import create_wrapper_from_config, load_config
+
+from streamdiffusion import StreamDiffusionWrapper
+
+
+class TouchDesignerManager:
+    """
+    Main manager class that bridges TouchDesigner with StreamDiffusion fork.
+
+    Key differences from your old version:
+    1. Uses new fork's config system and unified parameter updates
+    2. Maintains same SharedMemory/Syphon interface patterns
+    3. Leverages new fork's caching and performance improvements
+    """
+
+    def __init__(
+        self,
+        config: Union[str, Dict[str, Any]],
+        input_mem_name: str,
+        output_mem_name: str,
+        debug_mode: bool = False,
+        osc_reporter=None,
+    ):
+        self.input_mem_name = input_mem_name
+        self.output_mem_name = output_mem_name
+        self.osc_reporter = osc_reporter  # Lightweight telemetry reporter
+        self.osc_handler = None  # Parameter handler (set later)
+        self.debug_mode = debug_mode
+
+        # Handle both config dict (new) and config path (legacy compatibility)
+        if isinstance(config, dict):
+            if debug_mode:
+                print("Using pre-merged configuration dictionary")
+            self.config = config
+            self.config_path = None
+        else:
+            # Legacy: Load configuration using new fork's config system
+            if debug_mode:
+                print(f"Loading configuration from: {config}")
+            self.config = load_config(config)
+            self.config_path = config
+
+        # Extract TD-specific settings
+        self.td_settings = self.config.get("td_settings", {})
+
+        # Platform detection (same as your current version)
+        self.is_macos = platform.system() == "Darwin"
+        self.stream_method = "syphon" if self.is_macos else "shared_mem"
+
+        # Track which seed indices should be randomized every frame
+        self._randomize_seed_indices = []
+
+        # Initialize StreamDiffusion wrapper using new fork
+        logger.debug("Creating StreamDiffusion wrapper...")
+        self.wrapper: StreamDiffusionWrapper = create_wrapper_from_config(self.config)
+
+        # Memory interfaces (will be initialized in start_streaming)
+        self.input_memory: Optional[shared_memory.SharedMemory] = None
+        self.output_memory: Optional[shared_memory.SharedMemory] = None
+        self.control_memory: Optional[shared_memory.SharedMemory] = None
+        self.control_processed_memory: Optional[shared_memory.SharedMemory] = (
+            None  # For pre-processed ControlNet output
+        )
+        self.ipadapter_memory: Optional[shared_memory.SharedMemory] = None
+        self.syphon_handler = None
+
+        # Streaming state
+        self.streaming = False
+        self.stream_thread: Optional[threading.Thread] = None
+        self.paused = False
+        self.process_frame = False
+        self.frame_acknowledged = False
+
+        # State tracking for logging (only log changes)
+        self._last_paused_state = False
+        self._frames_processed_in_pause = 0
+
+        # ControlNet and IPAdapter state
+        self.ipadapter_update_requested = False
+        self.control_mem_name = self.input_mem_name + "-cn"
+        self.control_processed_mem_name = (
+            self.input_mem_name + "-cn-processed"
+        )  # For pre-processed ControlNet output
+        self.ipadapter_mem_name = self.input_mem_name + "-ip"
+
+        # Track live IPAdapter scale from OSC updates
+        self._current_ipadapter_scale = self.config.get("ipadapters", [{}])[0].get(
+            "scale", 1.0
+        )
+
+        # Performance tracking
+        self.frame_count = 0
+        self.total_frame_count = 0  # Total frames processed (for OSC)
+        self.start_time = time.time()
+        self.fps_smoothing = 0.9  # For exponential moving average
+        self.current_fps = 0.0          # Output FPS: includes cached/skipped frames
+        self.inference_fps = 0.0        # Inference FPS: only frames that ran GPU inference
+        self.last_frame_output_time = 0.0   # Wall-clock time of last frame output (any frame)
+        self.last_inference_time = 0.0      # Wall-clock time of last real inference frame
+
+        # OSC notification flags
+        self._sent_processed_cn_name = False
+
+        # Mode tracking (img2img or txt2img)
+        self.mode = self.config.get("mode", "img2img")
+        logger.info(f"Initialized in {self.mode} mode")
+
+        logger.info("TouchDesigner Manager initialized successfully")
+
+    def update_parameters(self, params: Dict[str, Any]) -> None:
+        """
+        Update StreamDiffusion parameters using new fork's unified system.
+
+        This replaces the scattered parameter updates in your old version
+        with a single, atomic update call that handles caching efficiently.
+        """
+        try:
+            # Track IPAdapter scale changes from OSC
+            if "ipadapter_config" in params and "scale" in params["ipadapter_config"]:
+                self._current_ipadapter_scale = params["ipadapter_config"]["scale"]
+                # Removed noisy log: logger.info(f"Updated IPAdapter scale to: {self._current_ipadapter_scale}")
+
+            # Filter out invalid parameters that wrapper doesn't accept
+            valid_params = [
+                "num_inference_steps",
+                "guidance_scale",
+                "delta",
+                "t_index_list",
+                "seed",
+                "prompt_list",
+                "negative_prompt",
+                "prompt_interpolation_method",
+                "normalize_prompt_weights",
+                "seed_list",
+                "seed_interpolation_method",
+                "normalize_seed_weights",
+                "controlnet_config",
+                "ipadapter_config",
+                "image_preprocessing_config",
+                "image_postprocessing_config",
+                "latent_preprocessing_config",
+                "latent_postprocessing_config",
+                "use_safety_checker",
+                "safety_checker_threshold",
+                "cache_maxframes",
+                "cache_interval",
+            ]
+
+            filtered_params = {k: v for k, v in params.items() if k in valid_params}
+
+            # Enforce guidance_scale > 1.0 for cfg_type "full" or "initialize"
+            # (pipeline.py requires this for negative prompt embeddings)
+            if "guidance_scale" in filtered_params:
+                cfg_type = getattr(self.wrapper.stream, "cfg_type", None)
+                if (
+                    cfg_type in ["full", "initialize"]
+                    and filtered_params["guidance_scale"] <= 1.0
+                ):
+                    filtered_params["guidance_scale"] = 1.2
+
+            # Track which seed indices have -1 for continuous per-frame randomization
+            if "seed_list" in filtered_params:
+                import random
+
+                # Clear and rebuild tracking based on this OSC update
+                self._randomize_seed_indices = []
+                new_seed_list = []
+                for idx, (seed, weight) in enumerate(filtered_params["seed_list"]):
+                    if seed == -1:
+                        # Mark this index for continuous randomization
+                        self._randomize_seed_indices.append(idx)
+                        # Generate random seed for this initial update
+                        seed = random.randint(0, 2**32 - 1)
+                    new_seed_list.append((seed, weight))
+                filtered_params["seed_list"] = new_seed_list
+
+            # Use the new fork's unified parameter update system
+            self.wrapper.update_stream_params(**filtered_params)
+        except Exception as e:
+            logger.error(f"Error updating parameters: {e}")
+
+    def start_streaming(self) -> None:
+        """Initialize memory interfaces and start the streaming loop"""
+        if self.streaming:
+            logger.warning("Already streaming!")
+            return
+
+        try:
+            # Send starting state via reporter (not handler - that's for parameters)
+            if self.osc_reporter:
+                self.osc_reporter.set_state("local_starting")
+
+            self._initialize_memory_interfaces()
+            self.streaming = True
+
+            # Start streaming in separate thread (non-blocking)
+            self.stream_thread = threading.Thread(
+                target=self._streaming_loop, daemon=True
+            )
+            self.stream_thread.start()
+
+            # Print styled startup message
+            print("\n\033[32mStream active\033[0m\n")
+
+            logger.info(f"Streaming started - Method: {self.stream_method}")
+
+        except Exception as e:
+            logger.error(f"Failed to start streaming: {e}")
+            raise
+
+    def stop_streaming(self) -> None:
+        """Stop streaming and cleanup resources"""
+        if not self.streaming:
+            return
+
+        logger.info("Stopping streaming...")
+        self.streaming = False
+
+        # Send offline state via reporter
+        if self.osc_reporter:
+            self.osc_reporter.set_state("local_offline")
+
+        # Wait for streaming thread to finish
+        if self.stream_thread and self.stream_thread.is_alive():
+            self.stream_thread.join(timeout=2.0)
+
+        # Cleanup memory interfaces
+        self._cleanup_memory_interfaces()
+
+        logger.info("Streaming stopped")
+
+    def _initialize_memory_interfaces(self) -> None:
+        """Initialize platform-specific memory interfaces"""
+        width = self.config["width"]
+        height = self.config["height"]
+
+        if self.is_macos:
+            # Initialize Syphon (using existing syphon_utils.py)
+            try:
+                from syphon_utils import SyphonUtils
+
+                self.syphon_handler = SyphonUtils(
+                    sender_name=self.output_mem_name,
+                    input_name=self.input_mem_name,
+                    control_name=None,
+                    width=width,
+                    height=height,
+                    debug=False,
+                )
+                self.syphon_handler.start()
+                logger.debug(
+                    f"Syphon initialized - Input: {self.input_mem_name}, Output: {self.output_mem_name}"
+                )
+
+            except Exception as e:
+                logger.error(f"Failed to initialize Syphon: {e}")
+                raise
+        else:
+            # Initialize SharedMemory (same pattern as your current version)
+            try:
+                # Input memory (from TouchDesigner)
+                self.input_memory = shared_memory.SharedMemory(name=self.input_mem_name)
+                logger.debug(f"Connected to input SharedMemory: {self.input_mem_name}")
+
+                # Output memory (to TouchDesigner) - try to connect first, create if not exists
+                frame_size = width * height * 3  # RGB
+                try:
+                    # Try to connect to existing memory first (like main_sdtd.py)
+                    self.output_memory = shared_memory.SharedMemory(
+                        name=self.output_mem_name
+                    )
+                    logger.debug(
+                        f"Connected to existing output SharedMemory: {self.output_mem_name}"
+                    )
+                except FileNotFoundError:
+                    # Create new if doesn't exist
+                    self.output_memory = shared_memory.SharedMemory(
+                        name=self.output_mem_name, create=True, size=frame_size
+                    )
+                    logger.debug(
+                        f"Created new output SharedMemory: {self.output_mem_name}"
+                    )
+
+                # ControlNet memory (per-frame updates)
+                try:
+                    self.control_memory = shared_memory.SharedMemory(
+                        name=self.control_mem_name
+                    )
+                    logger.debug(
+                        f"Connected to ControlNet SharedMemory: {self.control_mem_name}"
+                    )
+                except FileNotFoundError:
+                    logger.debug(
+                        f"ControlNet SharedMemory not found: {self.control_mem_name} (will create if needed)"
+                    )
+                    self.control_memory = None
+
+                # ControlNet processed output memory (to send pre-processed image back to TD)
+                # Create output memory for processed ControlNet with -cn-processed suffix
+                control_processed_mem_name = self.output_mem_name + "-cn-processed"
+                try:
+                    # Try to connect to existing memory first
+                    self.control_processed_memory = shared_memory.SharedMemory(
+                        name=control_processed_mem_name
+                    )
+                    logger.debug(
+                        f"Connected to existing ControlNet processed output SharedMemory: {control_processed_mem_name}"
+                    )
+                except FileNotFoundError:
+                    # Create new if doesn't exist
+                    self.control_processed_memory = shared_memory.SharedMemory(
+                        name=control_processed_mem_name,
+                        create=True,
+                        size=frame_size,  # Same size as main output
+                    )
+                    logger.debug(
+                        f"Created new ControlNet processed output SharedMemory: {control_processed_mem_name}"
+                    )
+
+                # IPAdapter memory (OSC-triggered updates)
+                try:
+                    self.ipadapter_memory = shared_memory.SharedMemory(
+                        name=self.ipadapter_mem_name
+                    )
+                    logger.debug(
+                        f"Connected to IPAdapter SharedMemory: {self.ipadapter_mem_name}"
+                    )
+                except FileNotFoundError:
+                    logger.debug(
+                        f"IPAdapter SharedMemory not found: {self.ipadapter_mem_name} (will create if needed)"
+                    )
+                    self.ipadapter_memory = None
+
+            except Exception as e:
+                logger.error(f"Failed to initialize SharedMemory: {e}")
+                raise
+
+    def _cleanup_memory_interfaces(self) -> None:
+        """Cleanup memory interfaces"""
+        try:
+            if self.syphon_handler:
+                self.syphon_handler.stop()
+                self.syphon_handler = None
+
+            if self.input_memory:
+                self.input_memory.close()
+                self.input_memory = None
+
+            if self.output_memory:
+                self.output_memory.close()
+                self.output_memory.unlink()  # Delete the shared memory
+                self.output_memory = None
+
+            if self.control_memory:
+                self.control_memory.close()
+                self.control_memory = None
+
+            if self.control_processed_memory:
+                self.control_processed_memory.close()
+                self.control_processed_memory.unlink()  # Delete the shared memory
+                self.control_processed_memory = None
+
+            if self.ipadapter_memory:
+                self.ipadapter_memory.close()
+                self.ipadapter_memory = None
+
+        except Exception as e:
+            logger.error(f"Error during cleanup: {e}")
+
+    def _streaming_loop(self) -> None:
+        """
+        Main streaming loop - processes frames as fast as possible.
+
+        Key insight: The wrapper.img2img() call runs as fast as it can.
+        You don't manually control the speed - the wrapper handles timing internally.
+        """
+        logger.info("Starting streaming loop...")
+
+        frame_time_accumulator = 0.0
+        fps_report_interval = 1.0  # Report FPS every second
+
+        while self.streaming:
+            try:
+                # CRITICAL: Check pause state FIRST before any processing
+                if self.paused:
+                    if not self.process_frame:
+                        time.sleep(
+                            0.001
+                        )  # Brief sleep if paused and no frame requested
+                        continue
+                    else:
+                        # Reset acknowledgment flag when starting frame processing
+                        self.frame_acknowledged = False
+                        self._frames_processed_in_pause += 1
+
+                        # Log useful info every 10th frame
+                        if self._frames_processed_in_pause % 10 == 0:
+                            logger.info(
+                                f"PAUSE: Frame {self._frames_processed_in_pause} | FPS: {self.current_fps:.1f}"
+                            )
+
+                        # CRITICAL: Reset process_frame IMMEDIATELY after confirming we're processing
+                        # This must happen INSIDE the pause block, not outside
+                        self.process_frame = False
+                else:
+                    # In continuous mode, we don't need to reset process_frame
+                    pass
+
+                loop_start = time.time()
+
+                # Process through StreamDiffusion - mode determines which method to call
+                if self.mode == "txt2img":
+                    # txt2img mode: generate from prepared noise (respects seed)
+                    # Use init_noise instead of random noise to respect seed parameter
+                    stream = self.wrapper.stream
+                    batch_size = 1
+
+                    # Check if seed is -1 (randomize every frame)
+                    # Get current seed from config (updated via OSC)
+                    current_seed = self.config.get("seed", -1)
+
+                    if current_seed == -1:
+                        # Generate fresh random noise every frame (random seed)
+                        adjusted_noise = torch.randn(
+                            (batch_size, 4, stream.latent_height, stream.latent_width),
+                            device=stream.device,
+                            dtype=stream.dtype,
+                        )
+                    else:
+                        # Generate noise from seed (consistent image every frame)
+                        generator = torch.Generator(device=stream.device)
+                        generator.manual_seed(current_seed)
+                        adjusted_noise = torch.randn(
+                            (batch_size, 4, stream.latent_height, stream.latent_width),
+                            device=stream.device,
+                            dtype=stream.dtype,
+                            generator=generator,
+                        )
+
+                    # Apply latent preprocessing hooks before prediction
+                    adjusted_noise = stream._apply_latent_preprocessing_hooks(
+                        adjusted_noise
+                    )
+
+                    # Call predict_x0_batch with prepared noise
+                    x_0_pred_out = stream.predict_x0_batch(adjusted_noise)
+
+                    # Apply latent postprocessing hooks
+                    x_0_pred_out = stream._apply_latent_postprocessing_hooks(
+                        x_0_pred_out
+                    )
+
+                    # Store for latent feedback processors
+                    stream.prev_latent_result = x_0_pred_out.detach().clone()
+
+                    # Decode to image
+                    x_output = stream.decode_image(x_0_pred_out).detach().clone()
+
+                    # Apply image postprocessing hooks
+                    x_output = stream._apply_image_postprocessing_hooks(x_output)
+
+                    # Postprocess to desired output format
+                    output_image = self.wrapper.postprocess_image(
+                        x_output, output_type=self.wrapper.output_type
+                    )
+                else:
+                    # img2img mode: get input frame and process
+                    input_image = self._get_input_frame()
+                    if input_image is None:
+                        time.sleep(0.001)  # Brief sleep if no input
+                        continue
+
+                    # Convert input for pipeline
+                    if input_image.dtype == np.uint8:
+                        input_image = input_image.astype(np.float32) / 255.0
+
+                    # Process ControlNet frame (per-frame if enabled)
+                    self._process_controlnet_frame()
+
+                    # Process IPAdapter frame (only on OSC request)
+                    self._process_ipadapter_frame()
+
+                    # Randomize tracked seed indices every frame (indices marked as -1 via OSC)
+                    if self._randomize_seed_indices:
+                        import random
+
+                        # Make a copy of current seed_list and randomize tracked indices
+                        current_seed_list = (
+                            self.wrapper.stream._param_updater._current_seed_list.copy()
+                        )
+                        for idx in self._randomize_seed_indices:
+                            _, weight = current_seed_list[idx]
+                            current_seed_list[idx] = (
+                                random.randint(0, 2**32 - 1),
+                                weight,
+                            )
+                        # Apply the randomized seed_list to the stream
+                        self.wrapper.update_stream_params(seed_list=current_seed_list)
+
+                    # Transform input image
+                    output_image = self.wrapper.img2img(input_image)
+
+                # Send output frame to TouchDesigner
+                self._send_output_frame(output_image)
+
+                # Output FPS: wall-clock rate of all frames sent to TD (includes cached skips)
+                frame_output_time = time.time()
+                if self.last_frame_output_time > 0:
+                    frame_interval = frame_output_time - self.last_frame_output_time
+                    instantaneous_fps = (
+                        1.0 / frame_interval if frame_interval > 0 else 0.0
+                    )
+                    self.current_fps = (
+                        self.current_fps * self.fps_smoothing
+                        + instantaneous_fps * (1 - self.fps_smoothing)
+                    )
+                self.last_frame_output_time = frame_output_time
+
+                # Inference FPS: only frames that actually ran GPU inference (similar filter skips excluded)
+                was_skipped = getattr(self.wrapper.stream, 'last_frame_was_skipped', False)
+                if not was_skipped:
+                    if self.last_inference_time > 0:
+                        inf_interval = frame_output_time - self.last_inference_time
+                        instantaneous_inf_fps = (
+                            1.0 / inf_interval if inf_interval > 0 else 0.0
+                        )
+                        self.inference_fps = (
+                            self.inference_fps * self.fps_smoothing
+                            + instantaneous_inf_fps * (1 - self.fps_smoothing)
+                        )
+                    self.last_inference_time = frame_output_time
+
+                # Update frame counters
+                self.frame_count += 1
+                self.total_frame_count += 1
+
+                # Update performance metrics for display
+                loop_end = time.time()
+                loop_time = loop_end - loop_start
+                frame_time_accumulator += loop_time
+
+                # Send OSC messages EVERY FRAME via reporter (like main_sdtd.py - critical for TD connection!)
+                if self.osc_reporter:
+                    self.osc_reporter.send_frame_count(self.total_frame_count)
+                    # Only send frame_ready in continuous mode, NOT in pause mode
+                    if not self.paused:
+                        self.osc_reporter.send_frame_ready(self.total_frame_count)
+
+                    # Send connection state update (transition to streaming after 30 frames)
+                    if self.total_frame_count == 30:
+                        self.osc_reporter.set_state("local_streaming")
+
+                    # Send processed ControlNet memory name (only once at startup or when needed)
+                    if (
+                        not self._sent_processed_cn_name
+                        and self.control_processed_memory
+                    ):
+                        processed_cn_name = self.output_mem_name + "-cn-processed"
+                        self.osc_reporter.send_controlnet_processed_name(
+                            processed_cn_name
+                        )
+                        self._sent_processed_cn_name = True
+
+                # Report FPS periodically using the correct frame output FPS (ALWAYS show)
+                if frame_time_accumulator >= fps_report_interval:
+                    # Calculate uptime
+                    uptime_seconds = int(time.time() - self.start_time)
+                    uptime_mins = uptime_seconds // 60
+                    uptime_secs = uptime_seconds % 60
+                    uptime_str = f"{uptime_mins:02d}:{uptime_secs:02d}"
+
+                    # Clear the line properly and write new status with color
+                    status_line = f"\033[38;5;208mStreaming | FPS: {self.inference_fps:.1f} (out: {self.current_fps:.1f}) | Uptime: {uptime_str}\033[0m"
+                    print(f"\r{' ' * 80}\r{status_line}", end="", flush=True)
+
+                    # Reset counters
+                    frame_time_accumulator = 0.0
+                    self.frame_count = 0
+
+                # Send FPS EVERY FRAME via reporter — use inference FPS (excludes similar-filter skips)
+                if self.osc_reporter:
+                    reported_fps = self.inference_fps if self.inference_fps > 0 else self.current_fps
+                    if reported_fps > 0:
+                        self.osc_reporter.send_fps(reported_fps)
+
+                # Handle frame acknowledgment for pause mode synchronization
+                if self.paused:
+                    # Send frame completion signal to TouchDesigner
+                    if self.osc_handler:
+                        self.osc_handler.send_message(
+                            "/frame_ready", self.total_frame_count
+                        )
+
+                    # Wait for TouchDesigner to acknowledge it has processed the frame
+                    acknowledgment_timeout = time.time() + 5.0  # 5 second timeout
+                    while (
+                        not self.frame_acknowledged
+                        and time.time() < acknowledgment_timeout
+                    ):
+                        time.sleep(0.001)  # Small sleep to prevent busy waiting
+
+                    if not self.frame_acknowledged:
+                        logger.warning(
+                            f"TIMEOUT: No /frame_ack after 5s (frame {self._frames_processed_in_pause})"
+                        )
+
+                    # Reset for next frame
+                    self.frame_acknowledged = False
+
+            except Exception as e:
+                logger.error(f"Error in streaming loop: {e}")
+                # Continue streaming unless explicitly stopped
+                if self.streaming:
+                    time.sleep(0.1)  # Brief pause before retrying
+                else:
+                    break
+
+        logger.info("Streaming loop ended")
+
+    def _get_input_frame(self) -> Optional[np.ndarray]:
+        """Get input frame from TouchDesigner (platform-specific)"""
+        try:
+            if self.is_macos and self.syphon_handler:
+                # Get frame from Syphon
+                frame = self.syphon_handler.capture_input_frame()
+                return frame
+
+            elif self.input_memory:
+                # Get frame from SharedMemory
+                width = self.config["width"]
+                height = self.config["height"]
+
+                # Create numpy array view of shared memory
+                frame = np.ndarray(
+                    (height, width, 3), dtype=np.uint8, buffer=self.input_memory.buf
+                )
+
+                return frame.copy()  # Copy to avoid memory issues
+
+            return None
+
+        except Exception as e:
+            logger.error(f"Error getting input frame: {e}")
+            return None
+
+    def _send_output_frame(
+        self, output_image: Union[Image.Image, np.ndarray]
+    ) -> None:
+        """Send output frame to TouchDesigner (platform-specific)"""
+        try:
+            # Convert output to numpy array if needed
+            if isinstance(output_image, Image.Image):
+                frame_np = np.array(output_image)
+            elif isinstance(output_image, torch.Tensor):
+                frame_np = output_image.cpu().numpy()
+                if frame_np.shape[0] == 3:  # CHW -> HWC
+                    frame_np = np.transpose(frame_np, (1, 2, 0))
+            else:
+                frame_np = output_image
+
+            # Ensure proper scaling like main_sdtd.py (line 916)
+            if frame_np.max() <= 1.0:
+                frame_np = (frame_np * 255).astype(np.uint8)
+            elif frame_np.dtype != np.uint8:
+                frame_np = frame_np.astype(np.uint8)
+
+            if self.is_macos and self.syphon_handler:
+                # Send via Syphon
+                self.syphon_handler.send_frame(frame_np)
+
+            elif self.output_memory:
+                # Send via SharedMemory
+                output_array = np.ndarray(
+                    frame_np.shape, dtype=np.uint8, buffer=self.output_memory.buf
+                )
+                np.copyto(output_array, frame_np)
+
+                # Diagnostic: confirm first frame write
+                if self.total_frame_count == 0:
+                    print(
+                        f"[SharedMem] First frame written: shape={frame_np.shape}, "
+                        f"dtype={frame_np.dtype}, mem_name={self.output_mem_name}, "
+                        f"buf_size={self.output_memory.size}"
+                    )
+
+        except Exception as e:
+            logger.error(f"Error sending output frame: {e}")
+            import traceback; traceback.print_exc()  # TEMP diagnostic
+
+    def _send_processed_controlnet_frame(
+        self, processed_tensor: Optional[torch.Tensor]
+    ) -> None:
+        """Send processed ControlNet frame to TouchDesigner via shared memory"""
+        if not self.control_processed_memory or processed_tensor is None:
+            return
+
+        try:
+            # Convert tensor to numpy array (similar to _send_output_frame)
+            if isinstance(processed_tensor, torch.Tensor):
+                frame_np = processed_tensor.cpu().numpy()
+
+                # Handle tensor format: CHW -> HWC
+                if frame_np.ndim == 4 and frame_np.shape[0] == 1:
+                    frame_np = frame_np.squeeze(0)  # Remove batch dimension
+                if frame_np.ndim == 3 and frame_np.shape[0] == 3:
+                    frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
+            else:
+                frame_np = processed_tensor
+
+            # Ensure proper scaling (0-1 range to 0-255)
+            if frame_np.max() <= 1.0:
+                frame_np = (frame_np * 255).astype(np.uint8)
+            elif frame_np.dtype != np.uint8:
+                frame_np = frame_np.astype(np.uint8)
+
+            # Send via SharedMemory
+            if self.control_processed_memory:
+                output_array = np.ndarray(
+                    frame_np.shape,
+                    dtype=np.uint8,
+                    buffer=self.control_processed_memory.buf,
+                )
+                np.copyto(output_array, frame_np)
+
+        except Exception as e:
+            logger.error(f"Error sending processed ControlNet frame: {e}")
+
+    def pause_streaming(self) -> None:
+        """Pause streaming (frame processing continues only on process_frame requests)"""
+        if not self.paused:  # Only log state changes
+            self.paused = True
+            self._frames_processed_in_pause = 0  # Reset counter
+
+            logger.info("STREAMING PAUSED - waiting for /process_frame commands")
+
+    def resume_streaming(self) -> None:
+        """Resume streaming (continuous frame processing)"""
+        if self.paused:  # Only log state changes
+            self.paused = False
+            logger.info(
+                f"STREAMING RESUMED - processed {self._frames_processed_in_pause} frames while paused"
+            )
+
+    def process_single_frame(self) -> None:
+        """Process a single frame when paused"""
+        if self.paused:
+            self.process_frame = True
+            # Only log occasionally to reduce spam
+        else:
+            logger.warning("process_single_frame called but streaming is not paused")
+
+    def acknowledge_frame(self) -> None:
+        """Acknowledge frame processing completion (called by TouchDesigner)"""
+        self.frame_acknowledged = True
+
+    def _process_controlnet_frame(self) -> None:
+        """Process ControlNet frame data (per-frame updates)"""
+        if not self.control_memory or not self.config.get("use_controlnet", False):
+            return
+
+        try:
+            width = self.config["width"]
+            height = self.config["height"]
+
+            # Read ControlNet frame from shared memory
+            control_frame = np.ndarray(
+                (height, width, 3), dtype=np.uint8, buffer=self.control_memory.buf
+            )
+
+            # Convert to float [0,1] and pass to wrapper
+            if control_frame.dtype == np.uint8:
+                control_frame = control_frame.astype(np.float32) / 255.0
+
+            # Update ControlNet image in wrapper (index 0 for first ControlNet)
+            self.wrapper.update_control_image(0, control_frame)
+
+            # IMPORTANT: After processing, extract the pre-processed image and send it back to TD
+            # The processed image is now available in the controlnet module
+            try:
+                if (
+                    hasattr(self.wrapper, "stream")
+                    and hasattr(self.wrapper.stream, "_controlnet_module")
+                    and self.wrapper.stream._controlnet_module is not None
+                ):
+                    controlnet_module = self.wrapper.stream._controlnet_module
+
+                    # Check if we have processed images available
+                    if (
+                        hasattr(controlnet_module, "controlnet_images")
+                        and len(controlnet_module.controlnet_images) > 0
+                        and controlnet_module.controlnet_images[0] is not None
+                    ):
+                        processed_tensor = controlnet_module.controlnet_images[0]
+
+                        # Send the processed ControlNet image back to TouchDesigner
+                        self._send_processed_controlnet_frame(processed_tensor)
+
+            except Exception as processed_error:
+                logger.debug(
+                    f"Could not extract processed ControlNet image: {processed_error}"
+                )
+
+        except Exception as e:
+            logger.error(f"Error processing ControlNet frame: {e}")
+
+    def _process_ipadapter_frame(self) -> None:
+        """Process IPAdapter frame data (OSC-triggered updates only)"""
+        if not self.config.get("use_ipadapter", False):
+            return
+        if not self.ipadapter_memory:
+            logger.debug(
+                f"IPAdapter SharedMemory not connected: {self.ipadapter_mem_name}"
+            )
+            return
+
+        if not self.ipadapter_update_requested:
+            return
+
+        try:
+            width = self.config["width"]
+            height = self.config["height"]
+
+            # Read IPAdapter frame from shared memory
+            ipadapter_frame = np.ndarray(
+                (height, width, 3), dtype=np.uint8, buffer=self.ipadapter_memory.buf
+            )
+
+            # Convert to float [0,1] and pass to wrapper
+            if ipadapter_frame.dtype == np.uint8:
+                ipadapter_frame = ipadapter_frame.astype(np.float32) / 255.0
+
+            # Update IPAdapter config FIRST (like img2img example)
+            # Use live scale value updated by OSC, not static config
+            current_scale = self._current_ipadapter_scale
+            logger.info(f"Using live IPAdapter scale: {current_scale}")
+            self.wrapper.update_stream_params(ipadapter_config={"scale": current_scale})
+
+            # THEN update the style image (following img2img pattern)
+            self.wrapper.update_style_image(ipadapter_frame)
+
+            # CRITICAL FIX: Trigger prompt re-blending to apply the new IPAdapter embeddings
+            # The embedding hooks only run during _apply_prompt_blending, so we must trigger it
+            # after updating the IPAdapter image to concatenate the new embeddings with prompts
+            if (
+                hasattr(self.wrapper.stream._param_updater, "_current_prompt_list")
+                and self.wrapper.stream._param_updater._current_prompt_list
+            ):
+                # Get current interpolation method or use default
+                interpolation_method = getattr(
+                    self.wrapper.stream._param_updater,
+                    "_last_prompt_interpolation_method",
+                    "slerp",
+                )
+                # Re-apply prompt blending to trigger embedding hooks
+                self.wrapper.stream._param_updater._apply_prompt_blending(
+                    interpolation_method
+                )
+                logger.info(
+                    "Re-applied prompt blending to incorporate new IPAdapter embeddings"
+                )
+
+            # Debug: Check if embeddings were cached
+            cached_embeddings = (
+                self.wrapper.stream._param_updater.get_cached_embeddings(
+                    "ipadapter_main"
+                )
+            )
+            if cached_embeddings is not None:
+                logger.info(
+                    f"IPAdapter config+image updated! Scale: {current_scale}, Embeddings: {cached_embeddings[0].shape}"
+                )
+            else:
+                logger.error("IPAdapter embeddings NOT cached!")
+
+            # Reset the update flag
+            self.ipadapter_update_requested = False
+            logger.info("IPAdapter config and image updated from SharedMemory")
+
+        except Exception as e:
+            logger.error(f"Error processing IPAdapter frame: {e}")
+
+    def request_ipadapter_update(self) -> None:
+        """Request IPAdapter image update on next frame (called via OSC)"""
+        self.ipadapter_update_requested = True
+        logger.info("IPAdapter update requested")
+
+    def set_mode(self, mode: str) -> None:
+        """
+        Switch between txt2img and img2img modes.
+
+        Args:
+            mode: Either "txt2img" or "img2img"
+        """
+        if mode not in ["txt2img", "img2img"]:
+            logger.error(f"Invalid mode: {mode}. Must be 'txt2img' or 'img2img'")
+            return
+
+        if mode == self.mode:
+            logger.info(f"Already in {mode} mode")
+            return
+
+        logger.info(f"Switching from {self.mode} to {mode} mode")
+        self.mode = mode
+
+        # txt2img mode requirements
+        if mode == "txt2img":
+            logger.warning(
+                "txt2img mode requires cfg_type='none' - verify your config!"
+            )
+
+    def get_stream_state(self) -> Dict[str, Any]:
+        """Get current streaming state and parameters"""
+        return {
+            "streaming": self.streaming,
+            "paused": self.paused,
+            "fps": self.current_fps,
+            "frame_count": self.frame_count,
+            "stream_method": self.stream_method,
+            "wrapper_state": self.wrapper.get_stream_state()
+            if hasattr(self.wrapper, "get_stream_state")
+            else {},
+            "controlnet_connected": self.control_memory is not None,
+            "controlnet_processed_connected": self.control_processed_memory is not None,
+            "ipadapter_connected": self.ipadapter_memory is not None,
+        }
diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py
index 44ab60a0..d7556616 100644
--- a/src/streamdiffusion/pipeline.py
+++ b/src/streamdiffusion/pipeline.py
@@ -115,6 +115,7 @@ def __init__(
 
         self.inference_time_ema = 0
         self.similar_filter_sleep_fraction = 0.025
+        self.last_frame_was_skipped = False  # True when similar filter skipped inference this frame
 
         # Initialize SDXL-specific attributes
         if self.is_sdxl:
@@ -989,9 +990,11 @@ def __call__(
             if self.similar_image_filter:
                 x = self.similar_filter(x)
                 if x is None:
+                    self.last_frame_was_skipped = True
                     time.sleep(self.inference_time_ema * self.similar_filter_sleep_fraction)
                     return self.prev_image_result
-            
+
+            self.last_frame_was_skipped = False
             x_t_latent = self.encode_image(x)
             
             # LATENT PREPROCESSING HOOKS: After VAE encoding, before diffusion

From e35a716a18efdaf9e735a4201bc9a0ec79fcce64 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Wed, 1 Apr 2026 21:16:09 -0400
Subject: [PATCH 09/11] perf: pre-allocate image output buffers, replace
 .clone() with .copy_() in pipeline hot path

---
 src/streamdiffusion/pipeline.py | 488 +++++++++++++++-----------------
 1 file changed, 233 insertions(+), 255 deletions(-)

diff --git a/src/streamdiffusion/pipeline.py b/src/streamdiffusion/pipeline.py
index d7556616..44521838 100644
--- a/src/streamdiffusion/pipeline.py
+++ b/src/streamdiffusion/pipeline.py
@@ -1,28 +1,36 @@
+import logging
 import time
-from typing import List, Optional, Union, Any, Dict, Tuple, Literal
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import PIL.Image
 import torch
-from diffusers import LCMScheduler, TCDScheduler, StableDiffusionPipeline
+from diffusers import LCMScheduler, StableDiffusionPipeline, TCDScheduler
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import (
     retrieve_latents,
 )
 
-from streamdiffusion.model_detection import detect_model
 from streamdiffusion.hooks import (
-    EmbedsCtx, StepCtx, UnetKwargsDelta, ImageCtx, LatentCtx,
-    EmbeddingHook, UnetHook, ImageHook, LatentHook
+    EmbeddingHook,
+    EmbedsCtx,
+    ImageCtx,
+    ImageHook,
+    LatentCtx,
+    LatentHook,
+    StepCtx,
+    UnetHook,
+    UnetKwargsDelta,
 )
 from streamdiffusion.image_filter import SimilarImageFilter
+from streamdiffusion.model_detection import detect_model
 from streamdiffusion.stream_parameter_updater import StreamParameterUpdater
 
-import logging
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class StreamDiffusion:
     def __init__(
         self,
@@ -64,11 +72,11 @@ def __init__(
 
         # Detect model type
         detection_result = detect_model(pipe.unet, pipe)
-        self.model_type = detection_result['model_type']
-        self.is_sdxl = detection_result['is_sdxl']
-        self.is_turbo = detection_result['is_turbo']
-        self.detection_confidence = detection_result['confidence']
-    
+        self.model_type = detection_result["model_type"]
+        self.is_sdxl = detection_result["is_sdxl"]
+        self.is_turbo = detection_result["is_turbo"]
+        self.detection_confidence = detection_result["confidence"]
+
         # TCD scheduler is incompatible with denoising batch optimization due to Strategic Stochastic Sampling
         # Force sequential processing for TCD
         if scheduler == "tcd":
@@ -81,13 +89,9 @@ def __init__(
             self.use_denoising_batch = True
             self.batch_size = self.denoising_steps_num * frame_buffer_size
             if self.cfg_type == "initialize":
-                self.trt_unet_batch_size = (
-                    self.denoising_steps_num + 1
-                ) * self.frame_bff_size
+                self.trt_unet_batch_size = (self.denoising_steps_num + 1) * self.frame_bff_size
             elif self.cfg_type == "full":
-                self.trt_unet_batch_size = (
-                    2 * self.denoising_steps_num * self.frame_bff_size
-                )
+                self.trt_unet_batch_size = 2 * self.denoising_steps_num * self.frame_bff_size
             else:
                 self.trt_unet_batch_size = self.denoising_steps_num * frame_buffer_size
         else:
@@ -102,13 +106,15 @@ def __init__(
         self.similar_filter = SimilarImageFilter()
         self.prev_image_result = None
         self.prev_latent_result = None
-        self._latent_cache = None   # pre-allocated buffer; avoids per-frame CUDA malloc for prev_latent_result
-        self._noise_buf = None      # pre-allocated buffer for per-step noise in TCD non-batched path
+        self._latent_cache = None  # pre-allocated buffer; avoids per-frame CUDA malloc for prev_latent_result
+        self._noise_buf = None  # pre-allocated buffer for per-step noise in TCD non-batched path
+        self._image_decode_buf = None  # pre-allocated buffer for VAE decode output (avoids .clone())
+        self._prev_image_buf = None  # pre-allocated buffer for skip-frame image cache
 
         self.pipe = pipe
         self.image_processor = VaeImageProcessor(pipe.vae_scale_factor)
         self.scheduler = self._initialize_scheduler(scheduler, sampler, pipe.scheduler.config)
-        
+
         self.text_encoder = pipe.text_encoder
         self.unet = pipe.unet
         self.vae = pipe.vae
@@ -128,19 +134,19 @@ def __init__(
         # Hook containers (step 1: introduced but initially no-op)
         self.embedding_hooks: List[EmbeddingHook] = []
         self.unet_hooks: List[UnetHook] = []
-        
+
         # Phase 1: Core Pipeline Hooks (Immediate Priority)
         self.image_preprocessing_hooks: List[ImageHook] = []
         self.latent_preprocessing_hooks: List[LatentHook] = []
         self.latent_postprocessing_hooks: List[LatentHook] = []
-        
+
         # Phase 2: Quality & Performance Hooks
         self.image_postprocessing_hooks: List[ImageHook] = []
         self.image_filtering_hooks: List[ImageHook] = []
-        
+
         # Cache TensorRT detection to avoid repeated hasattr checks
         self._is_unet_tensorrt = None
-        
+
         # Cache SDXL conditioning tensors to avoid repeated torch.cat/repeat operations
         self._sdxl_conditioning_cache: Dict[str, torch.Tensor] = {}
         self._cached_batch_size: Optional[int] = None
@@ -165,15 +171,15 @@ def _initialize_scheduler(self, scheduler_type: str, sampler_type: str, config):
             "beta": {"beta_schedule": "scaled_linear"},
             "karras": {},  # Karras sigmas handled per scheduler
         }
-        
+
         # Get sampler-specific configuration
         sampler_params = sampler_config.get(sampler_type, {})
-        
+
         # Set original_inference_steps to 100 to allow flexible num_inference_steps updates
         # This prevents "original_steps x strength < num_inference_steps" errors when
         # dynamically changing num_inference_steps in production without pipeline restarts
-        sampler_params['original_inference_steps'] = 100
-        
+        sampler_params["original_inference_steps"] = 100
+
         if scheduler_type == "lcm":
             return LCMScheduler.from_config(config, **sampler_params)
         elif scheduler_type == "tcd":
@@ -185,29 +191,34 @@ def _initialize_scheduler(self, scheduler_type: str, sampler_type: str, config):
     def _check_unet_tensorrt(self) -> bool:
         """Cache TensorRT detection to avoid repeated hasattr calls"""
         if self._is_unet_tensorrt is None:
-            self._is_unet_tensorrt = hasattr(self.unet, 'engine') and hasattr(self.unet, 'stream')
+            self._is_unet_tensorrt = hasattr(self.unet, "engine") and hasattr(self.unet, "stream")
         return self._is_unet_tensorrt
 
-    def _get_cached_sdxl_conditioning(self, batch_size: int, cfg_type: str, guidance_scale: float) -> Optional[Dict[str, torch.Tensor]]:
+    def _get_cached_sdxl_conditioning(
+        self, batch_size: int, cfg_type: str, guidance_scale: float
+    ) -> Optional[Dict[str, torch.Tensor]]:
         """Retrieve cached SDXL conditioning tensors if configuration matches"""
-        if (self._cached_batch_size == batch_size and 
-            self._cached_cfg_type == cfg_type and 
-            self._cached_guidance_scale == guidance_scale and
-            len(self._sdxl_conditioning_cache) > 0):
+        if (
+            self._cached_batch_size == batch_size
+            and self._cached_cfg_type == cfg_type
+            and self._cached_guidance_scale == guidance_scale
+            and len(self._sdxl_conditioning_cache) > 0
+        ):
             return {
-                'text_embeds': self._sdxl_conditioning_cache.get('text_embeds'),
-                'time_ids': self._sdxl_conditioning_cache.get('time_ids')
+                "text_embeds": self._sdxl_conditioning_cache.get("text_embeds"),
+                "time_ids": self._sdxl_conditioning_cache.get("time_ids"),
             }
         return None
 
-    def _cache_sdxl_conditioning(self, batch_size: int, cfg_type: str, guidance_scale: float, 
-                                text_embeds: torch.Tensor, time_ids: torch.Tensor) -> None:
+    def _cache_sdxl_conditioning(
+        self, batch_size: int, cfg_type: str, guidance_scale: float, text_embeds: torch.Tensor, time_ids: torch.Tensor
+    ) -> None:
         """Cache SDXL conditioning tensors for reuse"""
         self._cached_batch_size = batch_size
         self._cached_cfg_type = cfg_type
         self._cached_guidance_scale = guidance_scale
-        self._sdxl_conditioning_cache['text_embeds'] = text_embeds.clone()
-        self._sdxl_conditioning_cache['time_ids'] = time_ids.clone()
+        self._sdxl_conditioning_cache["text_embeds"] = text_embeds.clone()
+        self._sdxl_conditioning_cache["time_ids"] = time_ids.clone()
 
     def _build_sdxl_conditioning(self, batch_size: int) -> Dict[str, torch.Tensor]:
         """Build SDXL conditioning tensors with optimized tensor operations"""
@@ -219,7 +230,7 @@ def _build_sdxl_conditioning(self, batch_size: int) -> Dict[str, torch.Tensor]:
             cond_text = self.add_text_embeds[1:2]
             uncond_time = self.add_time_ids[0:1]
             cond_time = self.add_time_ids[1:2]
-            
+
             if batch_size > 1:
                 cond_text_repeated = cond_text.expand(batch_size - 1, -1).contiguous()
                 cond_time_repeated = cond_time.expand(batch_size - 1, -1).contiguous()
@@ -228,7 +239,7 @@ def _build_sdxl_conditioning(self, batch_size: int) -> Dict[str, torch.Tensor]:
             else:
                 add_text_embeds = uncond_text
                 add_time_ids = uncond_time
-                
+
         elif self.guidance_scale > 1.0 and (self.cfg_type == "full"):
             # For full mode: repeat both uncond and cond for each latent
             repeat_factor = batch_size // 2
@@ -240,13 +251,7 @@ def _build_sdxl_conditioning(self, batch_size: int) -> Dict[str, torch.Tensor]:
             source_time = self.add_time_ids[1:2] if self.add_time_ids.shape[0] > 1 else self.add_time_ids
             add_text_embeds = source_text.expand(batch_size, -1).contiguous()
             add_time_ids = source_time.expand(batch_size, -1).contiguous()
-        return {
-            'text_embeds': add_text_embeds,
-            'time_ids': add_time_ids
-        }
-
-
-
+        return {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
 
     def load_lora(
         self,
@@ -254,9 +259,7 @@ def load_lora(
         adapter_name: Optional[Any] = None,
         **kwargs,
     ) -> None:
-        self._load_lora_with_offline_fallback(
-            pretrained_lora_model_name_or_path_or_dict, adapter_name, **kwargs
-        )
+        self._load_lora_with_offline_fallback(pretrained_lora_model_name_or_path_or_dict, adapter_name, **kwargs)
 
     def _load_lora_with_offline_fallback(
         self,
@@ -363,7 +366,7 @@ def prepare(
 
         # Handle SDXL vs SD1.5/SD2.1 text encoding differently
         if self.is_sdxl:
-            # SDXL encode_prompt returns 4 values: 
+            # SDXL encode_prompt returns 4 values:
             # (prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)
             encoder_output = self.pipe.encode_prompt(
                 prompt=prompt,
@@ -380,40 +383,38 @@ def prepare(
                 lora_scale=None,
                 clip_skip=None,
             )
-            
+
             if len(encoder_output) >= 4:
-                prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds = encoder_output[:4]
-                
+                prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds = (
+                    encoder_output[:4]
+                )
+
                 # Set up prompt embeddings for the UNet (base before hooks)
                 base_prompt_embeds = prompt_embeds.repeat(self.batch_size, 1, 1)
-                
+
                 # Handle CFG for prompt embeddings
                 if self.use_denoising_batch and self.cfg_type == "full":
                     uncond_prompt_embeds = negative_prompt_embeds.repeat(self.batch_size, 1, 1)
                 elif self.cfg_type == "initialize":
                     uncond_prompt_embeds = negative_prompt_embeds.repeat(self.frame_bff_size, 1, 1)
 
-                if self.guidance_scale > 1.0 and (
-                    self.cfg_type == "initialize" or self.cfg_type == "full"
-                ):
-                    base_prompt_embeds = torch.cat(
-                        [uncond_prompt_embeds, base_prompt_embeds], dim=0
-                    )
-                
+                if self.guidance_scale > 1.0 and (self.cfg_type == "initialize" or self.cfg_type == "full"):
+                    base_prompt_embeds = torch.cat([uncond_prompt_embeds, base_prompt_embeds], dim=0)
+
                 # Set up SDXL-specific conditioning (added_cond_kwargs)
                 if do_classifier_free_guidance:
                     self.add_text_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
                 else:
                     self.add_text_embeds = pooled_prompt_embeds
-                
+
                 # Create time conditioning for SDXL micro-conditioning
                 original_size = (self.height, self.width)
                 target_size = (self.height, self.width)
                 crops_coords_top_left = (0, 0)
-                
+
                 add_time_ids = list(original_size + crops_coords_top_left + target_size)
                 add_time_ids = torch.tensor([add_time_ids], dtype=self.dtype, device=self.device)
-                
+
                 if do_classifier_free_guidance:
                     self.add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
                 else:
@@ -445,12 +446,8 @@ def prepare(
             elif self.cfg_type == "initialize":
                 uncond_prompt_embeds = encoder_output[1].repeat(self.frame_bff_size, 1, 1)
 
-            if self.guidance_scale > 1.0 and (
-                self.cfg_type == "initialize" or self.cfg_type == "full"
-            ):
-                base_prompt_embeds = torch.cat(
-                    [uncond_prompt_embeds, base_prompt_embeds], dim=0
-                )
+            if self.guidance_scale > 1.0 and (self.cfg_type == "initialize" or self.cfg_type == "full"):
+                base_prompt_embeds = torch.cat([uncond_prompt_embeds, base_prompt_embeds], dim=0)
 
             # Run embedding hooks (no-op unless modules register)
             embeds_ctx = EmbedsCtx(prompt_embeds=base_prompt_embeds, negative_prompt_embeds=None)
@@ -470,9 +467,7 @@ def prepare(
         for t in self.t_list:
             self.sub_timesteps.append(self.timesteps[t])
 
-        sub_timesteps_tensor = torch.tensor(
-            self.sub_timesteps, dtype=torch.long, device=self.device
-        )
+        sub_timesteps_tensor = torch.tensor(self.sub_timesteps, dtype=torch.long, device=self.device)
         self.sub_timesteps_tensor = torch.repeat_interleave(
             sub_timesteps_tensor,
             repeats=self.frame_bff_size if self.use_denoising_batch else 1,
@@ -494,16 +489,8 @@ def prepare(
             c_skip_list.append(c_skip)
             c_out_list.append(c_out)
 
-        self.c_skip = (
-            torch.stack(c_skip_list)
-            .view(len(self.t_list), 1, 1, 1)
-            .to(dtype=self.dtype, device=self.device)
-        )
-        self.c_out = (
-            torch.stack(c_out_list)
-            .view(len(self.t_list), 1, 1, 1)
-            .to(dtype=self.dtype, device=self.device)
-        )
+        self.c_skip = torch.stack(c_skip_list).view(len(self.t_list), 1, 1, 1).to(dtype=self.dtype, device=self.device)
+        self.c_out = torch.stack(c_out_list).view(len(self.t_list), 1, 1, 1).to(dtype=self.dtype, device=self.device)
 
         alpha_prod_t_sqrt_list = []
         beta_prod_t_sqrt_list = []
@@ -518,9 +505,7 @@ def prepare(
             .to(dtype=self.dtype, device=self.device)
         )
         beta_prod_t_sqrt = (
-            torch.stack(beta_prod_t_sqrt_list)
-            .view(len(self.t_list), 1, 1, 1)
-            .to(dtype=self.dtype, device=self.device)
+            torch.stack(beta_prod_t_sqrt_list).view(len(self.t_list), 1, 1, 1).to(dtype=self.dtype, device=self.device)
         )
         self.alpha_prod_t_sqrt = torch.repeat_interleave(
             alpha_prod_t_sqrt,
@@ -532,7 +517,7 @@ def prepare(
             repeats=self.frame_bff_size if self.use_denoising_batch else 1,
             dim=0,
         )
-        #NOTE: this is a hack. Pipeline needs a major refactor along with stream parameter updater. 
+        # NOTE: this is a hack. Pipeline needs a major refactor along with stream parameter updater.
         self.update_prompt(prompt)
 
         # Only collapse tensors to scalars for LCM non-batched mode
@@ -561,14 +546,7 @@ def _get_scheduler_scalings(self, timestep):
 
     @torch.inference_mode()
     def update_prompt(self, prompt: str) -> None:
-        self._param_updater.update_stream_params(
-            prompt_list=[(prompt, 1.0)],
-            prompt_interpolation_method="linear"
-        )
-
-    
-
-
+        self._param_updater.update_stream_params(prompt_list=[(prompt, 1.0)], prompt_interpolation_method="linear")
 
     def get_normalize_prompt_weights(self) -> bool:
         """Get the current prompt weight normalization setting."""
@@ -578,14 +556,14 @@ def get_normalize_seed_weights(self) -> bool:
         """Get the current seed weight normalization setting."""
         return self._param_updater.get_normalize_seed_weights()
 
-
-
-
-
-    def set_scheduler(self, scheduler: Literal["lcm", "tcd"] = None, sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = None) -> None:
+    def set_scheduler(
+        self,
+        scheduler: Literal["lcm", "tcd"] = None,
+        sampler: Literal["simple", "sgm uniform", "normal", "ddim", "beta", "karras"] = None,
+    ) -> None:
         """
         Change the scheduler and/or sampler at runtime.
-        
+
         Parameters
         ----------
         scheduler : str, optional
@@ -597,7 +575,7 @@ def set_scheduler(self, scheduler: Literal["lcm", "tcd"] = None, sampler: Litera
             self.scheduler_type = scheduler
         if sampler is not None:
             self.sampler_type = sampler
-            
+
         self.scheduler = self._initialize_scheduler(self.scheduler_type, self.sampler_type, self.pipe.scheduler.config)
         logger.info(f"Scheduler changed to {self.scheduler_type} with {self.sampler_type} sampler")
 
@@ -605,18 +583,13 @@ def _uses_lcm_logic(self) -> bool:
         """Return True if scheduler uses LCM-style consistency boundary-condition math."""
         return isinstance(self.scheduler, LCMScheduler)
 
-
-
     def add_noise(
         self,
         original_samples: torch.Tensor,
         noise: torch.Tensor,
         t_index: int,
     ) -> torch.Tensor:
-        noisy_samples = (
-            self.alpha_prod_t_sqrt[t_index] * original_samples
-            + self.beta_prod_t_sqrt[t_index] * noise
-        )
+        noisy_samples = self.alpha_prod_t_sqrt[t_index] * original_samples + self.beta_prod_t_sqrt[t_index] * noise
         return noisy_samples
 
     def scheduler_step_batch(
@@ -629,8 +602,7 @@ def scheduler_step_batch(
             # Upcast division to fp32 — alpha_prod_t_sqrt can be small at early timesteps,
             # causing fp16 rounding artifacts; cast result back to original dtype.
             F_theta = (
-                (x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch).float()
-                / self.alpha_prod_t_sqrt.float()
+                (x_t_latent_batch - self.beta_prod_t_sqrt * model_pred_batch).float() / self.alpha_prod_t_sqrt.float()
             ).to(x_t_latent_batch.dtype)
             denoised_batch = self.c_out * F_theta + self.c_skip * x_t_latent_batch
         else:
@@ -638,9 +610,7 @@ def scheduler_step_batch(
                 (x_t_latent_batch - self.beta_prod_t_sqrt[idx] * model_pred_batch).float()
                 / self.alpha_prod_t_sqrt[idx].float()
             ).to(x_t_latent_batch.dtype)
-            denoised_batch = (
-                self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch
-            )
+            denoised_batch = self.c_out[idx] * F_theta + self.c_skip[idx] * x_t_latent_batch
         return denoised_batch
 
     def unet_step(
@@ -660,37 +630,38 @@ def unet_step(
 
         # Prepare UNet call arguments
         unet_kwargs = {
-            'sample': x_t_latent_plus_uc,
-            'timestep': t_list,
-            'encoder_hidden_states': self.prompt_embeds,
-            'return_dict': False,
+            "sample": x_t_latent_plus_uc,
+            "timestep": t_list,
+            "encoder_hidden_states": self.prompt_embeds,
+            "return_dict": False,
         }
-        
+
         # Add SDXL-specific conditioning if this is an SDXL model
-        if self.is_sdxl and hasattr(self, 'add_text_embeds') and hasattr(self, 'add_time_ids'):
+        if self.is_sdxl and hasattr(self, "add_text_embeds") and hasattr(self, "add_time_ids"):
             if self.add_text_embeds is not None and self.add_time_ids is not None:
                 # Handle batching for CFG - replicate conditioning to match batch size
                 batch_size = x_t_latent_plus_uc.shape[0]
-                
+
                 # Use optimized caching system for SDXL conditioning tensors
-                cached_conditioning = self._get_cached_sdxl_conditioning(batch_size, self.cfg_type, self.guidance_scale)
+                cached_conditioning = self._get_cached_sdxl_conditioning(
+                    batch_size, self.cfg_type, self.guidance_scale
+                )
                 if cached_conditioning is not None:
                     # Cache hit - reuse existing tensors
-                    add_text_embeds = cached_conditioning['text_embeds']
-                    add_time_ids = cached_conditioning['time_ids']
+                    add_text_embeds = cached_conditioning["text_embeds"]
+                    add_time_ids = cached_conditioning["time_ids"]
                 else:
                     # Cache miss - build new tensors using optimized operations
                     conditioning = self._build_sdxl_conditioning(batch_size)
-                    add_text_embeds = conditioning['text_embeds']
-                    add_time_ids = conditioning['time_ids']
+                    add_text_embeds = conditioning["text_embeds"]
+                    add_time_ids = conditioning["time_ids"]
                     # Cache for future use
-                    self._cache_sdxl_conditioning(batch_size, self.cfg_type, self.guidance_scale, add_text_embeds, add_time_ids)
-                
-                unet_kwargs['added_cond_kwargs'] = {
-                    'text_embeds': add_text_embeds,
-                    'time_ids': add_time_ids
-                }
-        
+                    self._cache_sdxl_conditioning(
+                        batch_size, self.cfg_type, self.guidance_scale, add_text_embeds, add_time_ids
+                    )
+
+                unet_kwargs["added_cond_kwargs"] = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+
         # Allow modules to contribute additional UNet kwargs via hooks
         try:
             step_ctx = StepCtx(
@@ -698,7 +669,7 @@ def unet_step(
                 t_list=t_list,
                 step_index=idx if isinstance(idx, int) else (int(idx) if idx is not None else None),
                 guidance_mode=self.cfg_type if self.guidance_scale > 1.0 else "none",
-                sdxl_cond=unet_kwargs.get('added_cond_kwargs', None)
+                sdxl_cond=unet_kwargs.get("added_cond_kwargs", None),
             )
             extra_from_hooks = {}
             for hook in self.unet_hooks:
@@ -706,42 +677,40 @@ def unet_step(
                 if delta is None:
                     continue
                 if delta.down_block_additional_residuals is not None:
-                    unet_kwargs['down_block_additional_residuals'] = delta.down_block_additional_residuals
+                    unet_kwargs["down_block_additional_residuals"] = delta.down_block_additional_residuals
                 if delta.mid_block_additional_residual is not None:
-                    unet_kwargs['mid_block_additional_residual'] = delta.mid_block_additional_residual
+                    unet_kwargs["mid_block_additional_residual"] = delta.mid_block_additional_residual
                 if delta.added_cond_kwargs is not None:
                     # Merge SDXL cond if both exist
-                    base_added = unet_kwargs.get('added_cond_kwargs', {})
+                    base_added = unet_kwargs.get("added_cond_kwargs", {})
                     base_added.update(delta.added_cond_kwargs)
-                    unet_kwargs['added_cond_kwargs'] = base_added
-                if getattr(delta, 'extra_unet_kwargs', None):
+                    unet_kwargs["added_cond_kwargs"] = base_added
+                if getattr(delta, "extra_unet_kwargs", None):
                     # Merge extra kwargs from hooks (e.g., ipadapter_scale)
                     try:
                         extra_from_hooks.update(delta.extra_unet_kwargs)
                     except Exception:
                         pass
             if extra_from_hooks:
-                unet_kwargs['extra_unet_kwargs'] = extra_from_hooks
+                unet_kwargs["extra_unet_kwargs"] = extra_from_hooks
         except Exception as e:
             logger.error(f"unet_step: unet hook failed: {e}")
             raise
 
         # Extract potential ControlNet residual kwargs and generic extra kwargs (e.g., ipadapter_scale)
-        hook_down_res = unet_kwargs.get('down_block_additional_residuals', None)
-        hook_mid_res = unet_kwargs.get('mid_block_additional_residual', None)
-        hook_extra_kwargs = unet_kwargs.get('extra_unet_kwargs', None) if 'extra_unet_kwargs' in unet_kwargs else None
+        hook_down_res = unet_kwargs.get("down_block_additional_residuals", None)
+        hook_mid_res = unet_kwargs.get("mid_block_additional_residual", None)
+        hook_extra_kwargs = unet_kwargs.get("extra_unet_kwargs", None) if "extra_unet_kwargs" in unet_kwargs else None
 
         # Call UNet with appropriate conditioning
         if self.is_sdxl:
             try:
-
-                
                 # Detect UNet type and use appropriate calling convention
-                added_cond_kwargs = unet_kwargs.get('added_cond_kwargs', {})
-                
+                added_cond_kwargs = unet_kwargs.get("added_cond_kwargs", {})
+
                 # Check if this is a TensorRT engine or PyTorch UNet
                 is_tensorrt_engine = self._check_unet_tensorrt()
-                
+
                 if is_tensorrt_engine:
                     # TensorRT engine expects positional args + kwargs. IP-Adapter scale vector, if any, is provided by hooks via extra_unet_kwargs
                     extra_kwargs = {}
@@ -750,41 +719,42 @@ def unet_step(
 
                     # Include ControlNet residuals if provided by hooks
                     if hook_down_res is not None:
-                        extra_kwargs['down_block_additional_residuals'] = hook_down_res
+                        extra_kwargs["down_block_additional_residuals"] = hook_down_res
                     if hook_mid_res is not None:
-                        extra_kwargs['mid_block_additional_residual'] = hook_mid_res
+                        extra_kwargs["mid_block_additional_residual"] = hook_mid_res
 
                     model_pred, kvo_cache_out = self.unet(
-                        unet_kwargs['sample'],                    # latent_model_input (positional)
-                        unet_kwargs['timestep'],                  # timestep (positional)
-                        unet_kwargs['encoder_hidden_states'],     # encoder_hidden_states (positional)
+                        unet_kwargs["sample"],  # latent_model_input (positional)
+                        unet_kwargs["timestep"],  # timestep (positional)
+                        unet_kwargs["encoder_hidden_states"],  # encoder_hidden_states (positional)
                         kvo_cache=self.kvo_cache,
                         **extra_kwargs,
                         # For TRT engines, ensure SDXL cond shapes match engine builds; if engine expects 81 tokens (77+4), append dummy image tokens when none
-                        **added_cond_kwargs                       # SDXL conditioning as kwargs
+                        **added_cond_kwargs,  # SDXL conditioning as kwargs
                     )
                     self.update_kvo_cache(kvo_cache_out)
                 else:
                     # PyTorch UNet expects diffusers-style named arguments. Any processor scaling is handled by IP-Adapter hook
 
-                    call_kwargs = dict(
-                        sample=unet_kwargs['sample'],
-                        timestep=unet_kwargs['timestep'],
-                        encoder_hidden_states=unet_kwargs['encoder_hidden_states'],
-                        added_cond_kwargs=added_cond_kwargs,
-                        return_dict=False,
-                    )
+                    call_kwargs = {
+                        "sample": unet_kwargs["sample"],
+                        "timestep": unet_kwargs["timestep"],
+                        "encoder_hidden_states": unet_kwargs["encoder_hidden_states"],
+                        "added_cond_kwargs": added_cond_kwargs,
+                        "return_dict": False,
+                    }
                     # Include ControlNet residuals if present
                     if hook_down_res is not None:
-                        call_kwargs['down_block_additional_residuals'] = hook_down_res
+                        call_kwargs["down_block_additional_residuals"] = hook_down_res
                     if hook_mid_res is not None:
-                        call_kwargs['mid_block_additional_residual'] = hook_mid_res
+                        call_kwargs["mid_block_additional_residual"] = hook_mid_res
                     model_pred = self.unet(**call_kwargs)[0]
                     # No restoration for per-layer scale; next step will set again via updater/time factor
-                
+
             except Exception as e:
                 logger.error(f"[PIPELINE] unet_step: *** ERROR: SDXL UNet call failed: {e} ***")
                 import traceback
+
                 traceback.print_exc()
                 raise
         else:
@@ -798,9 +768,9 @@ def unet_step(
 
             # Include ControlNet residuals if present
             if hook_down_res is not None:
-                ip_scale_kw['down_block_additional_residuals'] = hook_down_res
+                ip_scale_kw["down_block_additional_residuals"] = hook_down_res
             if hook_mid_res is not None:
-                ip_scale_kw['mid_block_additional_residual'] = hook_mid_res
+                ip_scale_kw["mid_block_additional_residual"] = hook_mid_res
 
             model_pred, kvo_cache_out = self.unet(
                 x_t_latent_plus_uc,
@@ -821,23 +791,19 @@ def unet_step(
             noise_pred_uncond, noise_pred_text = model_pred.chunk(2)
         else:
             noise_pred_text = model_pred
-            
-        if self.guidance_scale > 1.0 and (
-            self.cfg_type == "self" or self.cfg_type == "initialize"
-        ):
+
+        if self.guidance_scale > 1.0 and (self.cfg_type == "self" or self.cfg_type == "initialize"):
             noise_pred_uncond = self.stock_noise * self.delta
-            
+
         if self.guidance_scale > 1.0 and self.cfg_type != "none":
-            model_pred = noise_pred_uncond + self.guidance_scale * (
-                noise_pred_text - noise_pred_uncond
-            )
+            model_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
         else:
             model_pred = noise_pred_text
 
         # compute the previous noisy sample x_t -> x_t-1
         if self.use_denoising_batch:
             denoised_batch = self.scheduler_step_batch(model_pred, x_t_latent, idx)
-            
+
             if self.cfg_type == "self" or self.cfg_type == "initialize":
                 scaled_noise = self.beta_prod_t_sqrt * self.stock_noise
                 delta_x = self.scheduler_step_batch(model_pred, scaled_noise, idx)
@@ -857,9 +823,7 @@ def unet_step(
                     dim=0,
                 )
                 delta_x = delta_x / beta_next
-                init_noise = torch.concat(
-                    [self.init_noise[1:], self.init_noise[0:1]], dim=0
-                )
+                init_noise = torch.concat([self.init_noise[1:], self.init_noise[0:1]], dim=0)
                 self.stock_noise = init_noise + delta_x
 
         else:
@@ -890,11 +854,11 @@ def encode_image(self, image_tensors: torch.Tensor) -> torch.Tensor:
         )
         with torch.autocast("cuda", dtype=self.dtype):
             img_latent = retrieve_latents(self.vae.encode(image_tensors), self.generator)
-        
+
         img_latent = img_latent * self.vae.config.scaling_factor
-        
+
         x_t_latent = self.add_noise(img_latent, self.init_noise[0], 0)
-        
+
         return x_t_latent
 
     def decode_image(self, x_0_pred_out: torch.Tensor) -> torch.Tensor:
@@ -905,16 +869,14 @@ def decode_image(self, x_0_pred_out: torch.Tensor) -> torch.Tensor:
 
     def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor:
         prev_latent_batch = self.x_t_latent_buffer
-        
+
         # LCM supports our denoising-batch trick. TCD must use standard scheduler.step() sequentially
         # but now properly processes ControlNet hooks through unet_step()
         if self.use_denoising_batch and isinstance(self.scheduler, LCMScheduler):
             t_list = self.sub_timesteps_tensor
             if self.denoising_steps_num > 1:
                 x_t_latent = torch.cat((x_t_latent, prev_latent_batch), dim=0)
-                self.stock_noise = torch.cat(
-                    (self.init_noise[0:1], self.stock_noise[:-1]), dim=0
-                )
+                self.stock_noise = torch.cat((self.init_noise[0:1], self.stock_noise[:-1]), dim=0)
             x_0_pred_batch, model_pred = self.unet_step(x_t_latent, t_list)
 
             if self.denoising_steps_num > 1:
@@ -925,9 +887,7 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor:
                         + self.beta_prod_t_sqrt[1:] * self.init_noise[1:]
                     )
                 else:
-                    self.x_t_latent_buffer = (
-                        self.alpha_prod_t_sqrt[1:] * x_0_pred_batch[:-1]
-                    )
+                    self.x_t_latent_buffer = self.alpha_prod_t_sqrt[1:] * x_0_pred_batch[:-1]
             else:
                 x_0_pred_out = x_0_pred_batch
                 self.x_t_latent_buffer = None
@@ -944,15 +904,25 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor:
                 # For TCD, use the same UNet calling logic as LCM to ensure ControlNet hooks are processed
                 if isinstance(self.scheduler, TCDScheduler):
                     # Use unet_step to process ControlNet hooks and get proper noise prediction
-                    t_expanded = t.view(1,).repeat(self.frame_bff_size,)
+                    t_expanded = t.view(
+                        1,
+                    ).repeat(
+                        self.frame_bff_size,
+                    )
                     x_0_pred, model_pred = self.unet_step(sample, t_expanded, idx)
-                    
+
                     # Apply TCD scheduler step to the guided noise prediction
                     step_out = self.scheduler.step(model_pred, t, sample)
-                    sample = getattr(step_out, "prev_sample", step_out[0] if isinstance(step_out, (tuple, list)) else step_out)
+                    sample = getattr(
+                        step_out, "prev_sample", step_out[0] if isinstance(step_out, (tuple, list)) else step_out
+                    )
                 else:
                     # Original LCM logic for non-batched mode
-                    t = t.view(1,).repeat(self.frame_bff_size,)
+                    t = t.view(
+                        1,
+                    ).repeat(
+                        self.frame_bff_size,
+                    )
                     x_0_pred, model_pred = self.unet_step(sample, t, idx)
                     if idx < len(self.sub_timesteps_tensor) - 1:
                         if self.do_add_noise:
@@ -972,21 +942,17 @@ def predict_x0_batch(self, x_t_latent: torch.Tensor) -> torch.Tensor:
         return x_0_pred_out
 
     @torch.inference_mode()
-    def __call__(
-        self, x: Union[torch.Tensor, PIL.Image.Image, np.ndarray] = None
-    ) -> torch.Tensor:
+    def __call__(self, x: Union[torch.Tensor, PIL.Image.Image, np.ndarray] = None) -> torch.Tensor:
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)
         start.record()
-        
+
         if x is not None:
-            x = self.image_processor.preprocess(x, self.height, self.width).to(
-                device=self.device, dtype=self.dtype
-            )
-            
+            x = self.image_processor.preprocess(x, self.height, self.width).to(device=self.device, dtype=self.dtype)
+
             # IMAGE PREPROCESSING HOOKS: After built-in preprocessing, before filtering
             x = self._apply_image_preprocessing_hooks(x)
-            
+
             if self.similar_image_filter:
                 x = self.similar_filter(x)
                 if x is None:
@@ -996,7 +962,7 @@ def __call__(
 
             self.last_frame_was_skipped = False
             x_t_latent = self.encode_image(x)
-            
+
             # LATENT PREPROCESSING HOOKS: After VAE encoding, before diffusion
             x_t_latent = self._apply_latent_preprocessing_hooks(x_t_latent)
         else:
@@ -1004,94 +970,101 @@ def __call__(
             x_t_latent = torch.randn((1, 4, self.latent_height, self.latent_width)).to(
                 device=self.device, dtype=self.dtype
             )
-        
+
         x_0_pred_out = self.predict_x0_batch(x_t_latent)
-        
+
         # LATENT POSTPROCESSING HOOKS: After diffusion, before VAE decoding
         x_0_pred_out = self._apply_latent_postprocessing_hooks(x_0_pred_out)
-        
+
         # Store latent result for latent feedback processors (reuse pre-allocated buffer)
         if self._latent_cache is None:
             self._latent_cache = torch.empty_like(x_0_pred_out)
         self._latent_cache.copy_(x_0_pred_out)
         self.prev_latent_result = self._latent_cache
 
-        x_output = self.decode_image(x_0_pred_out).clone()
+        _decoded = self.decode_image(x_0_pred_out)
+        if self._image_decode_buf is None:
+            self._image_decode_buf = torch.empty_like(_decoded)
+        self._image_decode_buf.copy_(_decoded)
+        x_output = self._image_decode_buf
 
         # IMAGE POSTPROCESSING HOOKS: After VAE decoding, before final output
         x_output = self._apply_image_postprocessing_hooks(x_output)
 
-        # Clone for skip-frame cache — TRT VAE buffer is reused on next decode call
-        self.prev_image_result = x_output.clone()
+        # Copy into pre-allocated skip-frame cache — TRT VAE buffer is reused on next decode call
+        if self._prev_image_buf is None:
+            self._prev_image_buf = torch.empty_like(x_output)
+        self._prev_image_buf.copy_(x_output)
+        self.prev_image_result = self._prev_image_buf
         end.record()
         end.synchronize()  # Wait only for this event, not all streams globally
         inference_time = start.elapsed_time(end) / 1000
         self.inference_time_ema = 0.9 * self.inference_time_ema + 0.1 * inference_time
-        
+
         return x_output
 
     # =========================================================================
     # Pipeline Hook Helper Methods (Phase 3: Performance-Optimized Hot Path)
     # =========================================================================
-    
+
     def _apply_image_preprocessing_hooks(self, x: torch.Tensor) -> torch.Tensor:
         """Apply image preprocessing hooks with minimal hot path overhead."""
         # Early exit - zero overhead when no hooks registered
         if not self.image_preprocessing_hooks:
             return x
-        
+
         # Single context object creation to minimize allocation overhead
         image_ctx = ImageCtx(image=x, width=self.width, height=self.height)
-        
+
         # Direct iteration - no additional function calls
         for hook in self.image_preprocessing_hooks:
             image_ctx = hook(image_ctx)
-        
+
         return image_ctx.image
-    
+
     def _apply_image_postprocessing_hooks(self, x: torch.Tensor) -> torch.Tensor:
         """Apply image postprocessing hooks with minimal hot path overhead."""
         # Early exit - zero overhead when no hooks registered
         if not self.image_postprocessing_hooks:
             return x
-        
+
         # Single context object creation to minimize allocation overhead
         image_ctx = ImageCtx(image=x, width=self.width, height=self.height)
-        
+
         # Direct iteration - no additional function calls
         for hook in self.image_postprocessing_hooks:
             image_ctx = hook(image_ctx)
-        
+
         return image_ctx.image
-    
+
     def _apply_latent_preprocessing_hooks(self, latent: torch.Tensor) -> torch.Tensor:
         """Apply latent preprocessing hooks with minimal hot path overhead."""
         # Early exit - zero overhead when no hooks registered
         if not self.latent_preprocessing_hooks:
             return latent
-        
+
         # Single context object creation to minimize allocation overhead
         latent_ctx = LatentCtx(latent=latent)
-        
+
         # Direct iteration - no additional function calls
         for hook in self.latent_preprocessing_hooks:
             latent_ctx = hook(latent_ctx)
-        
+
         return latent_ctx.latent
-    
+
     def _apply_latent_postprocessing_hooks(self, latent: torch.Tensor) -> torch.Tensor:
         """Apply latent postprocessing hooks with minimal hot path overhead."""
-        # Early exit - zero overhead when no hooks registered  
+        # Early exit - zero overhead when no hooks registered
         if not self.latent_postprocessing_hooks:
             return latent
-        
+
         # Single context object creation to minimize allocation overhead
         latent_ctx = LatentCtx(latent=latent)
-        
+
         # Direct iteration - no additional function calls
         for hook in self.latent_postprocessing_hooks:
             latent_ctx = hook(latent_ctx)
-        
+
         return latent_ctx.latent
 
     @torch.inference_mode()
@@ -1101,18 +1074,21 @@ def txt2img(self, batch_size: int = 1) -> torch.Tensor:
                 device=self.device, dtype=self.dtype
             )
         )
-        
+
         # LATENT POSTPROCESSING HOOKS: After diffusion, before VAE decoding
         x_0_pred_out = self._apply_latent_postprocessing_hooks(x_0_pred_out)
-        
+
         # Store latent result for latent feedback processors (reuse pre-allocated buffer)
         if self._latent_cache is None:
             self._latent_cache = torch.empty_like(x_0_pred_out)
         self._latent_cache.copy_(x_0_pred_out)
         self.prev_latent_result = self._latent_cache
 
-        
-        x_output = self.decode_image(x_0_pred_out).clone()
+        _decoded = self.decode_image(x_0_pred_out)
+        if self._image_decode_buf is None:
+            self._image_decode_buf = torch.empty_like(_decoded)
+        self._image_decode_buf.copy_(_decoded)
+        x_output = self._image_decode_buf
 
         # IMAGE POSTPROCESSING HOOKS: After VAE decoding, before final output
         x_output = self._apply_image_postprocessing_hooks(x_output)
@@ -1126,26 +1102,31 @@ def txt2img_sd_turbo(self, batch_size: int = 1) -> torch.Tensor:
             device=self.device,
             dtype=self.dtype,
         )
-        
+
         # Prepare UNet call arguments
         unet_kwargs = {
-            'sample': x_t_latent,
-            'timestep': self.sub_timesteps_tensor,
-            'encoder_hidden_states': self.prompt_embeds,
-            'return_dict': False,
+            "sample": x_t_latent,
+            "timestep": self.sub_timesteps_tensor,
+            "encoder_hidden_states": self.prompt_embeds,
+            "return_dict": False,
         }
-        
+
         # Add SDXL-specific conditioning if this is an SDXL model
-        if self.is_sdxl and hasattr(self, 'add_text_embeds') and hasattr(self, 'add_time_ids'):
+        if self.is_sdxl and hasattr(self, "add_text_embeds") and hasattr(self, "add_time_ids"):
             if self.add_text_embeds is not None and self.add_time_ids is not None:
                 # For txt2img, replicate conditioning to match batch size
-                add_text_embeds = self.add_text_embeds[1:2].repeat(batch_size, 1) if self.add_text_embeds.shape[0] > 1 else self.add_text_embeds.repeat(batch_size, 1)
-                add_time_ids = self.add_time_ids[1:2].repeat(batch_size, 1) if self.add_time_ids.shape[0] > 1 else self.add_time_ids.repeat(batch_size, 1)
-                
-                unet_kwargs['added_cond_kwargs'] = {
-                    'text_embeds': add_text_embeds,
-                    'time_ids': add_time_ids
-                }
+                add_text_embeds = (
+                    self.add_text_embeds[1:2].repeat(batch_size, 1)
+                    if self.add_text_embeds.shape[0] > 1
+                    else self.add_text_embeds.repeat(batch_size, 1)
+                )
+                add_time_ids = (
+                    self.add_time_ids[1:2].repeat(batch_size, 1)
+                    if self.add_time_ids.shape[0] > 1
+                    else self.add_time_ids.repeat(batch_size, 1)
+                )
+
+                unet_kwargs["added_cond_kwargs"] = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
 
         # Call UNet with appropriate conditioning
         if self.is_sdxl:
@@ -1158,24 +1139,21 @@ def txt2img_sd_turbo(self, batch_size: int = 1) -> torch.Tensor:
                 encoder_hidden_states=self.prompt_embeds,
                 return_dict=False,
             )
-            
-        x_0_pred_out = (
-            x_t_latent - self.beta_prod_t_sqrt * model_pred
-        ) / self.alpha_prod_t_sqrt
-        
+
+        x_0_pred_out = (x_t_latent - self.beta_prod_t_sqrt * model_pred) / self.alpha_prod_t_sqrt
+
         # LATENT POSTPROCESSING HOOKS: After diffusion, before VAE decoding
         x_0_pred_out = self._apply_latent_postprocessing_hooks(x_0_pred_out)
-        
+
         # Store latent result for latent feedback processors (reuse pre-allocated buffer)
         if self._latent_cache is None:
             self._latent_cache = torch.empty_like(x_0_pred_out)
         self._latent_cache.copy_(x_0_pred_out)
         self.prev_latent_result = self._latent_cache
 
-        
         x_output = self.decode_image(x_0_pred_out)
-        
+
         # IMAGE POSTPROCESSING HOOKS: After VAE decoding, before final output
         x_output = self._apply_image_postprocessing_hooks(x_output)
-        
+
         return x_output

From 6f353e589c0b2b8c693fd7ef93a842c7004358d7 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 03:17:13 -0400
Subject: [PATCH 10/11] fix(l2-cache): add TRT activation caching path to
 setup_l2_persistence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UNet2DConditionModelEngine has no named_parameters() — the previous code
crashed with AttributeError when TRT acceleration was enabled.

Two-path dispatch based on UNet type:
- PyTorch nn.Module: existing Tier 2 cudaStreamSetAttribute weight-pinning path
- TRT engine wrapper: new set_trt_persistent_cache() using
  IExecutionContext.persistent_cache_limit for activation caching in L2

TRT's persistent_cache_limit checks cudaLimitPersistingL2CacheSize at
assignment time (not context-creation time), so Tier 1 reservation must
precede the set call — which is the existing execution order.

Adds hasattr guard in pin_hot_unet_weights() so TRT engines short-circuit
cleanly without attempting named_parameters() iteration.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/tools/cuda_l2_cache.py | 79 +++++++++++++++++++---
 1 file changed, 68 insertions(+), 11 deletions(-)

diff --git a/src/streamdiffusion/tools/cuda_l2_cache.py b/src/streamdiffusion/tools/cuda_l2_cache.py
index cdafcaa9..a2e54f27 100644
--- a/src/streamdiffusion/tools/cuda_l2_cache.py
+++ b/src/streamdiffusion/tools/cuda_l2_cache.py
@@ -321,6 +321,10 @@ def pin_hot_unet_weights(
         if not tier1_ok:
             return 0
 
+    # TRT engine objects don't expose PyTorch parameters — skip Tier 2 gracefully
+    if not hasattr(unet, "named_parameters"):
+        return 0
+
     # Tier 2: Set access policy on hot attention weights
     # Target: to_q, to_k, to_v, to_out weights in hot transformer blocks.
     # These are small-to-medium GEMMs that benefit most from L2 hits.
@@ -351,15 +355,64 @@ def pin_hot_unet_weights(
     return pinned_count
 
 
-def setup_l2_persistence(unet: torch.nn.Module) -> bool:
+def set_trt_persistent_cache(unet, persist_mb: int = L2_PERSIST_MB) -> bool:
+    """
+    Enable TRT activation caching in L2 for a TensorRT UNet engine.
+
+    Sets IExecutionContext.persistent_cache_limit so TRT retains intermediate
+    activations in the L2 persisting region already reserved by Tier 1.
+
+    TRT checks the current cudaLimitPersistingL2CacheSize at assignment time
+    (not at context creation), so calling this after reserve_l2_persisting_cache()
+    is correct — no reordering of engine initialization is needed.
+
+    Args:
+        unet: UNet2DConditionModelEngine (must have .engine.context attribute).
+        persist_mb: Target L2 budget in MB. Uses Tier 1 reservation size (L2/2),
+                    which is guaranteed <= persistingL2CacheMaxSize on Ampere+.
+
+    Returns:
+        True if activation caching was enabled successfully.
+    """
+    if not L2_PERSIST_ENABLED:
+        return False
+
+    try:
+        context = unet.engine.context
+    except AttributeError:
+        return False
+
+    if not hasattr(context, "persistent_cache_limit"):
+        return False
+
+    persist_bytes = persist_mb * 1024 * 1024
+    try:
+        context.persistent_cache_limit = persist_bytes
+        actual = context.persistent_cache_limit
+        print(
+            f"[L2] TRT UNet activation caching: {actual / (1024 * 1024):.0f}MB "
+            f"of L2 persisting region allocated for activation persistence"
+        )
+        return actual > 0
+    except Exception as e:
+        print(f"[L2] TRT persistent_cache_limit failed: {e}")
+        return False
+
+
+def setup_l2_persistence(unet) -> bool:
     """
     Main entry point: set up L2 cache persistence for UNet inference.
 
-    Call this AFTER model is loaded and BEFORE torch.compile.
-    For best results with frozen weights, call AFTER torch.compile with freezing=True.
+    Dispatches between two strategies based on UNet type:
+    - PyTorch UNet (nn.Module): pin hot attention weight tensors via
+      cudaStreamSetAttribute access policy windows (Tier 2).
+    - TRT UNet engine: enable TRT's native activation caching in L2 via
+      IExecutionContext.persistent_cache_limit.
+
+    Both paths share Tier 1 (cudaDeviceSetLimit L2 reservation).
 
     Args:
-        unet: The UNet model on CUDA.
+        unet: The UNet model — either a PyTorch nn.Module or a TRT engine wrapper.
 
     Returns:
         True if at least Tier 1 (L2 reservation) succeeded.
@@ -376,12 +429,16 @@ def setup_l2_persistence(unet: torch.nn.Module) -> bool:
     tier1_ok = reserve_l2_persisting_cache(L2_PERSIST_MB)
 
     if tier1_ok:
-        # Tier 2: per-tensor access policy (best-effort)
-        pinned = pin_hot_unet_weights(unet, persist_mb=0)  # Tier 1 already reserved
-        if pinned == 0:
-            print(
-                "[L2] Tier 2 access policy skipped (call pin_hot_unet_weights() "
-                "after compile+freeze for per-tensor control)"
-            )
+        if hasattr(unet, "named_parameters"):
+            # PyTorch path: pin hot attention weight tensors in L2
+            pinned = pin_hot_unet_weights(unet, persist_mb=0)  # Tier 1 already reserved
+            if pinned == 0:
+                print(
+                    "[L2] Tier 2 access policy skipped (call pin_hot_unet_weights() "
+                    "after compile+freeze for per-tensor control)"
+                )
+        else:
+            # TRT engine path: use TRT's native activation caching instead
+            set_trt_persistent_cache(unet, persist_mb=L2_PERSIST_MB)
 
     return tier1_ok

From 0c44db24502dc3a175088d2834cc6239b82185c6 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 04:02:08 -0400
Subject: [PATCH 11/11] fix: clamp TRT persistent_cache_limit to
 L2_cache_size//2 to avoid exceeding hardware max

---
 src/streamdiffusion/tools/cuda_l2_cache.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/streamdiffusion/tools/cuda_l2_cache.py b/src/streamdiffusion/tools/cuda_l2_cache.py
index a2e54f27..2ca5c40e 100644
--- a/src/streamdiffusion/tools/cuda_l2_cache.py
+++ b/src/streamdiffusion/tools/cuda_l2_cache.py
@@ -385,7 +385,8 @@ def set_trt_persistent_cache(unet, persist_mb: int = L2_PERSIST_MB) -> bool:
     if not hasattr(context, "persistent_cache_limit"):
         return False
 
-    persist_bytes = persist_mb * 1024 * 1024
+    props = torch.cuda.get_device_properties(torch.cuda.current_device())
+    persist_bytes = min(persist_mb * 1024 * 1024, props.L2_cache_size // 2)
     try:
         context.persistent_cache_limit = persist_bytes
         actual = context.persistent_cache_limit