From 124dd5e743658150ba57de804823026942d4f9fa Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 03:21:15 -0400
Subject: [PATCH 01/10] perf: clean up deprecated TRT 10.x API usage in engine
 builder and preprocessing

---
 .../acceleration/tensorrt/utilities.py        | 34 +++++++++++--------
 .../processors/realesrgan_trt.py              |  2 +-
 .../tools/compile_raft_tensorrt.py            |  2 +-
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index 48a6c319..83391271 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -261,11 +261,9 @@ def build(
 
         if workspace_size > 0:
             config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
-        if not enable_all_tactics:
-            config_kwargs["tactic_sources"] = [
-                1 << int(trt.TacticSource.CUBLAS),
-                1 << int(trt.TacticSource.CUBLAS_LT),
-            ]
+        # tactic_sources restriction removed: TacticSource.CUBLAS (deprecated TRT 10.0)
+        # and CUBLAS_LT (deprecated TRT 9.0) are no longer meaningful on TRT 10.x.
+        # TRT uses its default tactic selection for all builds regardless of enable_all_tactics.
 
         engine = engine_from_network(
             network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
@@ -314,19 +312,19 @@ def _build_fp8(self, onnx_path, input_profile, workspace_size, enable_all_tactic
             )
 
         config = builder.create_builder_config()
-        # TRT 10.12+ with STRONGLY_TYPED network: precision flags (FP8, FP16, TF32)
-        # must NOT be set — the Q/DQ node annotations dictate precision directly.
-        # Older TRT versions need both the BuilderFlag and the network flag.
+        # BuilderFlag.STRONGLY_TYPED was removed in TRT 10.12; the network-level flag
+        # (NetworkDefinitionCreationFlag.STRONGLY_TYPED, line ~304) is now the only
+        # mechanism. On older TRT versions where BuilderFlag.STRONGLY_TYPED still exists,
+        # we also set precision flags on the config so the builder considers FP8/FP16 kernels.
         if hasattr(trt.BuilderFlag, 'STRONGLY_TYPED'):
-            # TRT < 10.12: set all precision flags + STRONGLY_TYPED on config
+            # TRT < 10.12: BuilderFlag.STRONGLY_TYPED exists — set precision flags and
+            # the builder-level STRONGLY_TYPED flag alongside the network-level flag.
             config.set_flag(trt.BuilderFlag.FP8)
             config.set_flag(trt.BuilderFlag.FP16)
             config.set_flag(trt.BuilderFlag.TF32)
             config.set_flag(trt.BuilderFlag.STRONGLY_TYPED)
-        else:
-            # TRT 10.12+: NetworkDefinitionCreationFlag.STRONGLY_TYPED (line 304)
-            # handles precision; setting FP8 flag causes API Usage Error.
-            pass
+        # else: TRT 10.12+ — NetworkDefinitionCreationFlag.STRONGLY_TYPED (set on network
+        # creation above) is sufficient; Q/DQ node annotations dictate precision directly.
 
         if workspace_size > 0:
             config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size)
@@ -392,7 +390,10 @@ def allocate_buffers(self, shape_dict=None, device="cuda"):
             mode = self.engine.get_tensor_mode(name)
 
             if mode == trt.TensorIOMode.INPUT:
-                self.context.set_input_shape(name, shape)
+                if not self.context.set_input_shape(name, shape):
+                    raise RuntimeError(
+                        f"TensorRT: set_input_shape failed for '{name}' with shape {shape}"
+                    )
 
             tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype_np]).to(device=device)
             self.tensors[name] = tensor
@@ -481,7 +482,10 @@ def infer(self, feed_dict, stream, use_cuda_graph=False):
             self.tensors[name].copy_(buf)
 
         for name, tensor in self.tensors.items():
-            self.context.set_tensor_address(name, tensor.data_ptr())
+            if not self.context.set_tensor_address(name, tensor.data_ptr()):
+                raise RuntimeError(
+                    f"TensorRT: set_tensor_address failed for '{name}'"
+                )
 
         if use_cuda_graph:
             if self.cuda_graph_instance is not None:
diff --git a/src/streamdiffusion/preprocessing/processors/realesrgan_trt.py b/src/streamdiffusion/preprocessing/processors/realesrgan_trt.py
index e7009274..18adfbb2 100644
--- a/src/streamdiffusion/preprocessing/processors/realesrgan_trt.py
+++ b/src/streamdiffusion/preprocessing/processors/realesrgan_trt.py
@@ -307,7 +307,7 @@ def _build_tensorrt_engine(self):
         try:
             # Create builder and network
             builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
-            network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+            network = builder.create_network()  # EXPLICIT_BATCH deprecated/ignored in TRT 10.x
             parser = trt.OnnxParser(network, trt.Logger(trt.Logger.WARNING))
             
             # Parse ONNX model
diff --git a/src/streamdiffusion/tools/compile_raft_tensorrt.py b/src/streamdiffusion/tools/compile_raft_tensorrt.py
index 8dec3a76..8734987e 100644
--- a/src/streamdiffusion/tools/compile_raft_tensorrt.py
+++ b/src/streamdiffusion/tools/compile_raft_tensorrt.py
@@ -144,7 +144,7 @@ def build_tensorrt_engine(
     
     try:
         builder = trt.Builder(trt.Logger(trt.Logger.INFO))
-        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+        network = builder.create_network()  # EXPLICIT_BATCH deprecated/ignored in TRT 10.x
         parser = trt.OnnxParser(network, trt.Logger(trt.Logger.WARNING))
         
         logger.info("Parsing ONNX model...")

From 00108d36e7fb6db4540f33bb36b2d8000316a3bf Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 14:45:09 -0400
Subject: [PATCH 02/10] fix(fp8): remove direct_io_types/simplify, make
 allocate_buffers FP8-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove `direct_io_types=True` from ModelOpt quantize_kwargs — it caused
engine I/O tensors to be typed as FLOAT8E4M3FN, which crashes at runtime
because `trt.nptype()` has no numpy equivalent for FP8.

Remove `simplify=True` — always fails with protobuf >2GB parse error on
our external-data-format ONNX (graceful fallback, but wastes ~1 min).

Make `Engine.allocate_buffers` and `TensorRTEngine.allocate_buffers` FP8-
resilient: catch TypeError from `trt.nptype()` and fall back to
`torch.float8_e4m3fn` directly, bypassing the numpy intermediate.

FP8 ONNX must be regenerated (delete unet.engine.fp8.onnx* + unet.engine,
keep timing.cache). Entropy calibration and calibrate_per_node are retained.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../acceleration/tensorrt/fp8_quantize.py     |  34 +-
 .../acceleration/tensorrt/utilities.py        | 458 ++++++++++++++++--
 .../processors/temporal_net_tensorrt.py       |  20 +-
 3 files changed, 448 insertions(+), 64 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/fp8_quantize.py b/src/streamdiffusion/acceleration/tensorrt/fp8_quantize.py
index 2a0c2d2e..66e5a899 100644
--- a/src/streamdiffusion/acceleration/tensorrt/fp8_quantize.py
+++ b/src/streamdiffusion/acceleration/tensorrt/fp8_quantize.py
@@ -223,9 +223,10 @@ def quantize_onnx_fp8(
         onnx_fp8_path: Output FP8 quantized ONNX path (*.fp8.onnx).
         calibration_data: Unused. Kept for backward compatibility.
         quantize_mha: Enable FP8 quantization of multi-head attention ops.
-                      Recommended: True. Requires TRT 10+ and compute 8.9+.
-        percentile: Percentile for activation range calibration.
-                    1.0 = no clipping (safest for first run).
+                      Kept False — MHA analysis via ORT inference adds ~3 hours to build.
+                      Non-MHA ops (Conv, Gemm, MatMul outside MHA) are still FP8.
+        percentile: Unused. Kept for backward compatibility (entropy calibration
+                    does not use percentile clipping).
         alpha: SmoothQuant alpha — balances quantization difficulty between
                activations (alpha→0) and weights (alpha→1). 0.8 is optimal
                for transformer attention layers.
@@ -256,7 +257,7 @@ def quantize_onnx_fp8(
     logger.info(f"[FP8] Starting ONNX FP8 quantization")
     logger.info(f"[FP8]   Input:  {onnx_opt_path} ({input_size_mb:.0f} MB)")
     logger.info(f"[FP8]   Output: {onnx_fp8_path}")
-    logger.info(f"[FP8]   Config: quantize_mha={quantize_mha}, percentile={percentile}, alpha={alpha}")
+    logger.info(f"[FP8]   Config: quantize_mha={quantize_mha}, calibration=entropy, alpha={alpha}")
     logger.info(f"[FP8]   Calibration: RandomDataProvider with calibration_shapes (model_data={'provided' if model_data is not None else 'none'})")
 
     # Patch ByteSize() for >2GB ONNX models: modelopt calls onnx_model.ByteSize()
@@ -373,8 +374,10 @@ def _safe_byte_size(self):
     quantize_kwargs = {
         "quantize_mode": "fp8",
         "output_path": onnx_fp8_path,
-        "calibration_method": "percentile",
-        "percentile": percentile,
+        # entropy: minimizes KL divergence to find optimal clipping point for each tensor.
+        # Better than percentile=1.0 (no clipping) which allows outliers to stretch the
+        # quantization range, reducing precision for the bulk of activations.
+        "calibration_method": "entropy",
         "alpha": alpha,
         "use_external_data_format": True,
         # override_shapes replaces dynamic dims in the ONNX model itself with static
@@ -388,20 +391,23 @@ def _safe_byte_size(self):
         # Use default EPs ["cpu","cuda:0","trt"] — CPU-only would fail on this FP16 SDXL
         # model because ORT's mandatory CastFloat16Transformer inserts Cast nodes that
         # conflict with existing Cast nodes in the upsampler conv.
-        # disable_mha_qdq controls modelopt's MHA analysis. When True, MHA MatMul
-        # nodes are excluded from FP8 quantization WITHOUT running ORT inference.
-        # Non-MHA ops (Conv, Linear, LayerNorm) still get FP8 Q/DQ nodes.
+        # disable_mha_qdq=True: skip MHA pattern analysis (avoids 3-hour ORT inference
+        # pass over the full model graph). Non-MHA ops (Conv, Gemm, MatMul outside MHA)
+        # still get FP8 Q/DQ nodes via the normal KGEN/CASK path.
         "disable_mha_qdq": not quantize_mha,
+        # calibrate_per_node: calibrate one node at a time to reduce peak VRAM during
+        # calibration. Essential for large UNets (83 inputs, 7993 nodes) to avoid OOM.
+        "calibrate_per_node": True,
     }
 
     try:
         modelopt_quantize(onnx_opt_path, **quantize_kwargs)
     except TypeError as e:
-        # Older nvidia-modelopt versions may not support alpha/disable_mha_qdq.
-        # Retry with base parameters only.
-        logger.warning(f"[FP8] Retrying without alpha/disable_mha_qdq (TypeError: {e})")
-        quantize_kwargs.pop("alpha", None)
-        quantize_kwargs.pop("disable_mha_qdq", None)
+        # Older nvidia-modelopt versions may not support newer kwargs.
+        # Strip down to base parameters and retry.
+        logger.warning(f"[FP8] Retrying with reduced kwargs (TypeError: {e})")
+        for _k in ("alpha", "disable_mha_qdq", "calibrate_per_node"):
+            quantize_kwargs.pop(_k, None)
         modelopt_quantize(onnx_opt_path, **quantize_kwargs)
     except Exception as e:
         # MHA analysis (disable_mha_qdq=False) requires an ORT inference run that
diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index 83391271..d20f1d16 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -22,7 +22,9 @@
 
 # Set up logger for this module
 import logging
+import os
 from collections import OrderedDict
+from dataclasses import dataclass
 from typing import Optional, Union
 
 import numpy as np
@@ -40,14 +42,7 @@
 from PIL import Image
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
-from polygraphy.backend.trt import (
-    CreateConfig,
-    Profile,
-    engine_from_bytes,
-    engine_from_network,
-    network_from_onnx_path,
-    save_engine,
-)
+from polygraphy.backend.trt import engine_from_bytes
 
 from .models.models import CLIP, VAE, BaseModel, UNet, VAEEncoder
 
@@ -59,6 +54,244 @@
 from ...model_detection import detect_model
 
 
+# ---------------------------------------------------------------------------
+# GPU Hardware Profile — hardware-aware TRT builder configuration
+# ---------------------------------------------------------------------------
+
+@dataclass
+class GPUBuildProfile:
+    """
+    Hardware-aware TRT builder configuration derived from CUDA device properties.
+
+    All parameters are auto-selected based on GPU architecture tier:
+      - Ampere  (CC 8.0–8.8): Conservative — small L2, preserve VRAM
+      - Ada     (CC 8.9):      Balanced   — large L2, benefit from deeper tiling/opt
+      - Blackwell (CC 12.0+):  Aggressive — massive L2, max search depth
+    """
+    gpu_name: str
+    compute_capability: tuple
+    l2_cache_bytes: int
+    vram_bytes: int
+    sm_count: int
+    tier: str  # "ampere", "ada", "blackwell", "unknown"
+
+    # IBuilderConfig parameters
+    builder_optimization_level: int     # 0–5; higher = better kernels, longer build
+    tiling_optimization_level: str      # "NONE"/"FAST"/"MODERATE"/"FULL"
+    l2_limit_for_tiling: int            # bytes; target L2 budget for tiling
+    max_aux_streams: int                # reserved; NOT applied (TRT heuristic is better)
+    sparse_weights: bool                # examine weights for 2:4 sparsity (Ampere+)
+    enable_runtime_activation_resize: bool  # RUNTIME_ACTIVATION_RESIZE_10_10
+    max_workspace_cap_bytes: int        # hard cap on workspace (before free-mem calc)
+
+
+def detect_gpu_profile(device: int = 0) -> GPUBuildProfile:
+    """
+    Detect the current GPU and return hardware-optimal TRT builder parameters.
+
+    Called once at the start of every engine build so that all IBuilderConfig
+    settings are tuned to the exact GPU running the build.
+
+    Tiers and rationale
+    -------------------
+    Ampere (CC 8.0–8.8, e.g. RTX 3090 — 6 MiB L2, 82 SMs):
+      - Opt level 4: always compiles dynamic kernels (better than level-3 heuristics)
+      - Tiling FAST (static shapes only): small L2 gains little from deep search
+      - 8 GiB workspace cap: conserve VRAM on 24 GB cards
+
+    Ada Lovelace (CC 8.9, e.g. RTX 4090 — 72 MiB L2, 128 SMs):
+      - Opt level 4: dynamic kernels without level-5 profiling OOM risk
+      - Tiling MODERATE (static shapes only): 12× more L2 makes tiling worthwhile
+      - 12 GiB workspace cap
+
+    Blackwell (CC 12.0+, e.g. RTX 5090 — 128 MiB L2, ~170 SMs):
+      - Opt level 4: same rationale — level 5 causes OOM during tactic profiling
+      - Tiling FULL (static shapes only): massive L2 warrants widest search
+      - 16 GiB workspace cap
+
+    Note: tiling_optimization_level and l2_limit_for_tiling are only effective for
+    static-shape engines. TRT confirms: "Graph contains symbolic shape, l2tc doesn't
+    take effect." For dynamic-shape builds (our default), these are skipped entirely
+    to avoid warning spam and wasted build time.
+
+    max_aux_streams is NOT set — TRT's own heuristic is better than a fixed value.
+    Setting it explicitly causes "[MS] Multi stream is disabled" warnings on simple
+    models (VAE) without proven benefit on complex ones (UNet).
+    """
+    try:
+        props = torch.cuda.get_device_properties(device)
+    except Exception as e:
+        logger.warning(f"[TRT Build] Could not query GPU properties: {e} — using fallback profile")
+        return _fallback_profile()
+
+    cc = (props.major, props.minor)
+    l2 = props.L2_cache_size
+    vram = props.total_memory
+    sms = props.multi_processor_count
+
+    # --- Tier selection ---
+    # opt_level=4 for all tiers: always compiles dynamic kernels (better than
+    # level-3 heuristics) without level-5's "compare dynamic vs static" extra pass
+    # which OOMs during tactic profiling on dynamic-shape engines (160 GiB request).
+    if cc >= (12, 0):
+        tier = "blackwell"
+        opt_level = 4
+        tiling = "FULL"
+        max_ws_cap = 16 * (2 ** 30)   # 16 GiB cap
+    elif cc >= (8, 9):                 # Ada Lovelace (8.9 exactly)
+        tier = "ada"
+        opt_level = 4
+        tiling = "MODERATE"
+        max_ws_cap = 12 * (2 ** 30)   # 12 GiB cap
+    elif cc >= (8, 0):                 # Ampere (8.0 – 8.8)
+        tier = "ampere"
+        opt_level = 4
+        tiling = "FAST"
+        max_ws_cap = 8 * (2 ** 30)    # 8 GiB cap
+    else:
+        # Pre-Ampere or unknown — use conservative defaults
+        tier = "unknown"
+        opt_level = 3
+        tiling = "NONE"
+        max_ws_cap = 8 * (2 ** 30)
+
+    profile = GPUBuildProfile(
+        gpu_name=props.name,
+        compute_capability=cc,
+        l2_cache_bytes=l2,
+        vram_bytes=vram,
+        sm_count=sms,
+        tier=tier,
+        builder_optimization_level=opt_level,
+        tiling_optimization_level=tiling,
+        l2_limit_for_tiling=l2,        # use full L2 as tiling budget (static builds only)
+        max_aux_streams=0,              # 0 = let TRT decide (avoids "[MS] disabled" spam)
+        sparse_weights=True,            # always examine; no downside if not sparse
+        enable_runtime_activation_resize=True,
+        max_workspace_cap_bytes=max_ws_cap,
+    )
+
+    logger.info(
+        f"[TRT Build] GPU detected: {props.name} | "
+        f"CC {cc[0]}.{cc[1]} | Tier: {tier} | "
+        f"L2: {l2 // (1024 * 1024)} MiB | VRAM: {vram // (1024 ** 3)} GiB | "
+        f"opt_level={opt_level}"
+    )
+    return profile
+
+
+def _fallback_profile() -> GPUBuildProfile:
+    """Conservative fallback when GPU detection fails."""
+    return GPUBuildProfile(
+        gpu_name="unknown",
+        compute_capability=(8, 0),
+        l2_cache_bytes=6 * 1024 * 1024,
+        vram_bytes=24 * (2 ** 30),
+        sm_count=82,
+        tier="unknown",
+        builder_optimization_level=3,
+        tiling_optimization_level="NONE",
+        l2_limit_for_tiling=6 * 1024 * 1024,
+        max_aux_streams=0,              # reserved; NOT applied
+        sparse_weights=False,
+        enable_runtime_activation_resize=True,
+        max_workspace_cap_bytes=8 * (2 ** 30),
+    )
+
+
+def _apply_gpu_profile_to_config(
+    config: "trt.IBuilderConfig",
+    gpu_profile: Optional[GPUBuildProfile],
+    dynamic_shapes: bool = True,
+) -> None:
+    """
+    Apply hardware-aware IBuilderConfig parameters that Polygraphy does not expose.
+
+    Called for both FP16 and FP8 builds after the config object is created.
+    All settings gracefully degrade if the TRT version doesn't support a feature.
+
+    Args:
+        config: TRT IBuilderConfig to modify.
+        gpu_profile: Hardware-detected build parameters from detect_gpu_profile().
+        dynamic_shapes: Whether this engine uses dynamic input shapes.
+            - True  (default): tiling and l2_limit skipped — TRT confirms these have
+              no effect on symbolic-shape graphs and only produce warning spam.
+            - False (static): tiling and l2_limit applied for full L2 cache benefit.
+    """
+    if gpu_profile is None:
+        return
+
+    # builder_optimization_level (0–5):
+    #   4 = always compiles dynamic kernels (better than level-3 heuristics)
+    #   5 = additionally compares dynamic vs static kernels — causes OOM during
+    #       tactic profiling on dynamic-shape engines (160 GiB requests observed).
+    # We use level 4 for all tiers to get the dynamic-kernel benefit without the
+    # level-5 exhaustive comparison that OOMs.
+    try:
+        config.builder_optimization_level = gpu_profile.builder_optimization_level
+        logger.info(f"[TRT Config] builder_optimization_level={gpu_profile.builder_optimization_level}")
+    except AttributeError:
+        logger.debug("[TRT Config] builder_optimization_level not supported — skipping")
+
+    # tiling_optimization_level + l2_limit_for_tiling:
+    # TRT's L2 tiling cache optimization requires static/concrete shapes to work.
+    # For dynamic-shape engines, TRT emits: "Graph contains symbolic shape, l2tc
+    # doesn't take effect" for every applicable layer — pure warning spam with zero
+    # benefit. Skipped when dynamic_shapes=True.
+    if not dynamic_shapes and gpu_profile.tiling_optimization_level != "NONE":
+        try:
+            tiling_map = {
+                "NONE": trt.TilingOptimizationLevel.NONE,
+                "FAST": trt.TilingOptimizationLevel.FAST,
+                "MODERATE": trt.TilingOptimizationLevel.MODERATE,
+                "FULL": trt.TilingOptimizationLevel.FULL,
+            }
+            tiling_level = tiling_map.get(gpu_profile.tiling_optimization_level, trt.TilingOptimizationLevel.NONE)
+            config.tiling_optimization_level = tiling_level
+            logger.info(f"[TRT Config] tiling_optimization_level={gpu_profile.tiling_optimization_level}")
+        except AttributeError:
+            logger.debug("[TRT Config] tiling_optimization_level not supported — skipping")
+
+        try:
+            if gpu_profile.l2_limit_for_tiling > 0:
+                config.l2_limit_for_tiling = gpu_profile.l2_limit_for_tiling
+                logger.info(
+                    f"[TRT Config] l2_limit_for_tiling={gpu_profile.l2_limit_for_tiling // (1024 * 1024)} MiB"
+                )
+        except AttributeError:
+            logger.debug("[TRT Config] l2_limit_for_tiling not supported — skipping")
+    elif dynamic_shapes:
+        logger.debug(
+            "[TRT Config] tiling_optimization_level/l2_limit skipped — dynamic shapes "
+            "(would produce '[l2tc] VALIDATE FAIL' warnings with no effect)"
+        )
+
+    # max_aux_streams: NOT SET — let TRT use its own heuristic.
+    # Setting an explicit value causes "[MS] Multi stream is disabled" warnings on
+    # any model where TRT can't assign that many streams (e.g. VAE decoder which is
+    # too sequential). TRT's heuristic silently chooses the right value per model.
+
+    # SPARSE_WEIGHTS: let TRT examine weight tensors for structured 2:4 sparsity
+    # and use Sparse Tensor Core kernels if suitable. Zero downside for dense weights.
+    if gpu_profile.sparse_weights:
+        try:
+            config.set_flag(trt.BuilderFlag.SPARSE_WEIGHTS)
+            logger.info("[TRT Config] SPARSE_WEIGHTS enabled")
+        except Exception:
+            logger.debug("[TRT Config] SPARSE_WEIGHTS not supported — skipping")
+
+    # RUNTIME_ACTIVATION_RESIZE_10_10: allows update_device_memory_size_for_shapes()
+    # to shrink activation memory when actual input shapes are smaller than max profile
+    # dims. Our engines use dynamic shapes (min 256 → max 1024), so running at 512x512
+    # can save ~50–75% of peak activation VRAM compared to always allocating for 1024.
+    if gpu_profile.enable_runtime_activation_resize:
+        try:
+            config.set_preview_feature(trt.PreviewFeature.RUNTIME_ACTIVATION_RESIZE_10_10, True)
+            logger.info("[TRT Config] RUNTIME_ACTIVATION_RESIZE_10_10 enabled")
+        except Exception:
+            logger.debug("[TRT Config] RUNTIME_ACTIVATION_RESIZE_10_10 not supported — skipping")
+
+
 # Map of numpy dtype -> torch dtype
 numpy_to_torch_dtype_dict = {
     np.uint8: torch.uint8,
@@ -244,42 +477,116 @@ def build(
         timing_cache=None,
         workspace_size=0,
         fp8=False,
+        gpu_profile: Optional["GPUBuildProfile"] = None,
+        dynamic_shapes: bool = True,
     ):
         logger.info(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
 
         if fp8:
-            self._build_fp8(onnx_path, input_profile, workspace_size, enable_all_tactics)
+            self._build_fp8(
+                onnx_path, input_profile, workspace_size, enable_all_tactics,
+                timing_cache=timing_cache, gpu_profile=gpu_profile,
+                dynamic_shapes=dynamic_shapes,
+            )
             return
 
-        p = Profile()
+        # --- Build using raw TRT API for full IBuilderConfig access ---
+        # Polygraphy's CreateConfig does not expose: tiling_optimization_level,
+        # l2_limit_for_tiling, max_aux_streams, builder_optimization_level,
+        # set_preview_feature, or SPARSE_WEIGHTS. We use the raw API (same as
+        # the FP8 path) so all parameters are available for both precision paths.
+
+        build_logger = trt.Logger(trt.Logger.WARNING)
+        builder = trt.Builder(build_logger)
+
+        network_flags = 0
+        network = builder.create_network(network_flags)
+
+        parser = trt.OnnxParser(network, build_logger)
+        parser.set_flag(trt.OnnxParserFlag.NATIVE_INSTANCENORM)
+        success = parser.parse_from_file(onnx_path)
+        if not success:
+            errors = [parser.get_error(i) for i in range(parser.num_errors)]
+            raise RuntimeError(
+                f"TRT ONNX parser failed for FP16 engine: {onnx_path}\n"
+                + "\n".join(str(e) for e in errors)
+            )
+
+        config = builder.create_builder_config()
+
+        # Precision flags
+        if fp16:
+            config.set_flag(trt.BuilderFlag.FP16)
+        config.set_flag(trt.BuilderFlag.TF32)
+
+        if enable_refit:
+            config.set_flag(trt.BuilderFlag.REFIT)
+
+        # Workspace
+        if workspace_size > 0:
+            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size)
+
+        # Optimization profile
         if input_profile:
+            profile = builder.create_optimization_profile()
             for name, dims in input_profile.items():
-                assert len(dims) == 3
-                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+                assert len(dims) == 3, f"Expected (min, opt, max) for {name}"
+                profile.set_shape(name, min=dims[0], opt=dims[1], max=dims[2])
+            config.add_optimization_profile(profile)
+
+        # Timing cache — load existing or create fresh
+        cache_data = b""
+        if timing_cache and os.path.exists(timing_cache):
+            try:
+                with open(timing_cache, "rb") as f:
+                    cache_data = f.read()
+                logger.info(f"[TRT Build] Loaded timing cache: {timing_cache} ({len(cache_data) // 1024} KB)")
+            except Exception as e:
+                logger.warning(f"[TRT Build] Could not load timing cache {timing_cache}: {e} — starting fresh")
+                cache_data = b""
+        trt_cache = config.create_timing_cache(cache_data)
+        config.set_timing_cache(trt_cache, ignore_mismatch=False)
+
+        # Apply hardware-aware profile parameters
+        _apply_gpu_profile_to_config(config, gpu_profile, dynamic_shapes=dynamic_shapes)
+
+        # Build and serialize
+        logger.info(f"[TRT Build] Building FP16 engine (raw API): {self.engine_path}")
+        serialized = builder.build_serialized_network(network, config)
+        if serialized is None:
+            raise RuntimeError(
+                f"TRT FP16 engine build failed for {onnx_path}. "
+                "Check TRT logs above for details."
+            )
 
-        config_kwargs = {}
+        with open(self.engine_path, "wb") as f:
+            f.write(serialized)
 
-        if workspace_size > 0:
-            config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
-        # tactic_sources restriction removed: TacticSource.CUBLAS (deprecated TRT 10.0)
-        # and CUBLAS_LT (deprecated TRT 9.0) are no longer meaningful on TRT 10.x.
-        # TRT uses its default tactic selection for all builds regardless of enable_all_tactics.
-
-        engine = engine_from_network(
-            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
-            config=CreateConfig(
-                fp16=fp16,
-                tf32=True,
-                refittable=enable_refit,
-                profiles=[p],
-                load_timing_cache=timing_cache,
-                **config_kwargs,
-            ),
-            save_timing_cache=timing_cache,
-        )
-        save_engine(engine, path=self.engine_path)
+        # Save timing cache for next build
+        if timing_cache:
+            try:
+                updated_cache = config.get_timing_cache()
+                if updated_cache is not None:
+                    os.makedirs(os.path.dirname(timing_cache), exist_ok=True)
+                    with open(timing_cache, "wb") as f:
+                        f.write(updated_cache.serialize())
+                    logger.info(f"[TRT Build] Saved timing cache: {timing_cache}")
+            except Exception as e:
+                logger.warning(f"[TRT Build] Could not save timing cache: {e}")
 
-    def _build_fp8(self, onnx_path, input_profile, workspace_size, enable_all_tactics):
+        size_bytes = getattr(serialized, 'nbytes', None) or len(serialized)
+        logger.info(f"[TRT Build] FP16 engine saved: {self.engine_path} ({size_bytes / 1024 / 1024:.0f} MB)")
+
+    def _build_fp8(
+        self,
+        onnx_path,
+        input_profile,
+        workspace_size,
+        enable_all_tactics,
+        timing_cache=None,
+        gpu_profile: Optional["GPUBuildProfile"] = None,
+        dynamic_shapes: bool = True,
+    ):
         """
         Build a TRT engine from a Q/DQ-annotated FP8 ONNX using the raw TRT builder API.
 
@@ -292,17 +599,23 @@ def _build_fp8(self, onnx_path, input_profile, workspace_size, enable_all_tactic
             input_profile: Dict of {name: (min, opt, max)} shapes.
             workspace_size: TRT workspace limit in bytes.
             enable_all_tactics: If True, allow all TRT tactic sources.
+            timing_cache: Path to timing cache file for load/save.
+            gpu_profile: Hardware-aware build parameters from detect_gpu_profile().
+            dynamic_shapes: Whether the engine uses dynamic input shapes.
         """
-        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        build_logger = trt.Logger(trt.Logger.WARNING)
 
-        builder = trt.Builder(TRT_LOGGER)
+        builder = trt.Builder(build_logger)
 
         # STRONGLY_TYPED: required for FP8. Tells TRT to use the data-type annotations
         # from Q/DQ nodes rather than running its own precision heuristics.
         network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
         network = builder.create_network(network_flags)
 
-        parser = trt.OnnxParser(network, TRT_LOGGER)
+        parser = trt.OnnxParser(network, build_logger)
+        # NATIVE_INSTANCENORM: use TRT's fused InstanceNorm/GroupNorm kernel instead
+        # of decomposing into primitive ops. Diffusion UNets use GroupNorm heavily.
+        parser.set_flag(trt.OnnxParserFlag.NATIVE_INSTANCENORM)
         success = parser.parse_from_file(onnx_path)
         if not success:
             errors = [parser.get_error(i) for i in range(parser.num_errors)]
@@ -313,9 +626,9 @@ def _build_fp8(self, onnx_path, input_profile, workspace_size, enable_all_tactic
 
         config = builder.create_builder_config()
         # BuilderFlag.STRONGLY_TYPED was removed in TRT 10.12; the network-level flag
-        # (NetworkDefinitionCreationFlag.STRONGLY_TYPED, line ~304) is now the only
-        # mechanism. On older TRT versions where BuilderFlag.STRONGLY_TYPED still exists,
-        # we also set precision flags on the config so the builder considers FP8/FP16 kernels.
+        # (NetworkDefinitionCreationFlag.STRONGLY_TYPED, set on network creation above)
+        # is now the only mechanism. On older TRT versions where BuilderFlag.STRONGLY_TYPED
+        # still exists, we also set precision flags on the config.
         if hasattr(trt.BuilderFlag, 'STRONGLY_TYPED'):
             # TRT < 10.12: BuilderFlag.STRONGLY_TYPED exists — set precision flags and
             # the builder-level STRONGLY_TYPED flag alongside the network-level flag.
@@ -336,6 +649,22 @@ def _build_fp8(self, onnx_path, input_profile, workspace_size, enable_all_tactic
                 profile.set_shape(name, min=dims[0], opt=dims[1], max=dims[2])
             config.add_optimization_profile(profile)
 
+        # Timing cache — load existing or create fresh
+        cache_data = b""
+        if timing_cache and os.path.exists(timing_cache):
+            try:
+                with open(timing_cache, "rb") as f:
+                    cache_data = f.read()
+                logger.info(f"[FP8] Loaded timing cache: {timing_cache} ({len(cache_data) // 1024} KB)")
+            except Exception as e:
+                logger.warning(f"[FP8] Could not load timing cache {timing_cache}: {e} — starting fresh")
+                cache_data = b""
+        trt_cache = config.create_timing_cache(cache_data)
+        config.set_timing_cache(trt_cache, ignore_mismatch=False)
+
+        # Apply hardware-aware profile parameters
+        _apply_gpu_profile_to_config(config, gpu_profile, dynamic_shapes=dynamic_shapes)
+
         logger.info(f"[FP8] Building TRT FP8 engine (STRONGLY_TYPED): {self.engine_path}")
         serialized = builder.build_serialized_network(network, config)
         if serialized is None:
@@ -347,6 +676,18 @@ def _build_fp8(self, onnx_path, input_profile, workspace_size, enable_all_tactic
         with open(self.engine_path, "wb") as f:
             f.write(serialized)
 
+        # Save timing cache for next build
+        if timing_cache:
+            try:
+                updated_cache = config.get_timing_cache()
+                if updated_cache is not None:
+                    os.makedirs(os.path.dirname(timing_cache), exist_ok=True)
+                    with open(timing_cache, "wb") as f:
+                        f.write(updated_cache.serialize())
+                    logger.info(f"[FP8] Saved timing cache: {timing_cache}")
+            except Exception as e:
+                logger.warning(f"[FP8] Could not save timing cache: {e}")
+
         size_bytes = getattr(serialized, 'nbytes', None) or len(serialized)
         logger.info(f"[FP8] Engine saved: {self.engine_path} ({size_bytes / 1024 / 1024:.0f} MB)")
 
@@ -386,7 +727,16 @@ def allocate_buffers(self, shape_dict=None, device="cuda"):
             else:
                 shape = self.engine.get_tensor_shape(name)
 
-            dtype_np = trt.nptype(self.engine.get_tensor_dtype(name))
+            trt_dtype = self.engine.get_tensor_dtype(name)
+            try:
+                dtype_np = trt.nptype(trt_dtype)
+                torch_dtype = numpy_to_torch_dtype_dict[dtype_np]
+            except TypeError:
+                # FP8 (FLOAT8E4M3FN) has no numpy equivalent — map directly to torch
+                if trt_dtype == trt.DataType.FP8:
+                    torch_dtype = torch.float8_e4m3fn
+                else:
+                    raise
             mode = self.engine.get_tensor_mode(name)
 
             if mode == trt.TensorIOMode.INPUT:
@@ -395,7 +745,7 @@ def allocate_buffers(self, shape_dict=None, device="cuda"):
                         f"TensorRT: set_input_shape failed for '{name}' with shape {shape}"
                     )
 
-            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype_np]).to(device=device)
+            tensor = torch.empty(tuple(shape), dtype=torch_dtype).to(device=device)
             self.tensors[name] = tensor
 
         # Cache allocation parameters for reuse check
@@ -600,14 +950,31 @@ def build_engine(
     build_enable_refit: bool = False,
     fp8: bool = False,
 ):
+    # --- Step 0: Detect GPU and select hardware-optimal build parameters ---
+    gpu_profile = detect_gpu_profile(device=torch.cuda.current_device())
+
+    # --- Workspace sizing: leave 2 GiB for activations, cap per GPU tier ---
     _, free_mem, _ = cudart.cudaMemGetInfo()
-    GiB = 2**30
+    GiB = 2 ** 30
     if free_mem > 6 * GiB:
         activation_carveout = 2 * GiB
-        max_workspace_size = min(free_mem - activation_carveout, 8 * GiB)
+        max_workspace_size = min(
+            free_mem - activation_carveout,
+            gpu_profile.max_workspace_cap_bytes,
+        )
     else:
         max_workspace_size = 0
-    logger.info(f"TRT workspace: free_mem={free_mem / GiB:.1f}GiB, max_workspace={max_workspace_size / GiB:.1f}GiB")
+    logger.info(
+        f"[TRT Build] Workspace: free={free_mem / GiB:.1f} GiB, "
+        f"cap={gpu_profile.max_workspace_cap_bytes / GiB:.1f} GiB, "
+        f"allocated={max_workspace_size / GiB:.1f} GiB"
+    )
+
+    # --- Timing cache: shared per engine directory ---
+    # Cache is stored alongside the engine files so it persists across rebuilds.
+    engine_dir = os.path.dirname(engine_path)
+    timing_cache_path = os.path.join(engine_dir, "timing.cache")
+
     engine = Engine(engine_path)
     input_profile = model_data.get_input_profile(
         opt_batch_size,
@@ -622,8 +989,11 @@ def build_engine(
         input_profile=input_profile,
         enable_refit=build_enable_refit,
         enable_all_tactics=build_all_tactics,
+        timing_cache=timing_cache_path,
         workspace_size=max_workspace_size,
         fp8=fp8,
+        gpu_profile=gpu_profile,
+        dynamic_shapes=build_dynamic_shape,
     )
 
     return engine
diff --git a/src/streamdiffusion/preprocessing/processors/temporal_net_tensorrt.py b/src/streamdiffusion/preprocessing/processors/temporal_net_tensorrt.py
index 8932976e..8151722a 100644
--- a/src/streamdiffusion/preprocessing/processors/temporal_net_tensorrt.py
+++ b/src/streamdiffusion/preprocessing/processors/temporal_net_tensorrt.py
@@ -77,8 +77,18 @@ def allocate_buffers(self, device="cuda", input_shape=None):
         for idx in range(self.engine.num_io_tensors):
             name = self.engine.get_tensor_name(idx)
             shape = self.context.get_tensor_shape(name)
-            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
-            
+            trt_dtype = self.engine.get_tensor_dtype(name)
+            try:
+                dtype_np = trt.nptype(trt_dtype)
+                torch_dtype = numpy_to_torch_dtype_dict[dtype_np]
+            except TypeError:
+                # FP8 (FLOAT8E4M3FN) has no numpy equivalent — map directly to torch
+                if trt_dtype == trt.DataType.FP8:
+                    torch_dtype = torch.float8_e4m3fn
+                else:
+                    raise
+
+
             if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                 # For dynamic shapes, use provided input_shape
                 if input_shape is not None and any(dim == -1 for dim in shape):
@@ -96,10 +106,8 @@ def allocate_buffers(self, device="cuda", input_shape=None):
                     f"Tensor '{name}' still has dynamic dimensions {shape} after setting input shapes. "
                     f"Please provide input_shape parameter to allocate_buffers()."
                 )
-            
-            tensor = torch.empty(
-                tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]
-            ).to(device=device)
+
+            tensor = torch.empty(tuple(shape), dtype=torch_dtype).to(device=device)
             self.tensors[name] = tensor
 
     def infer(self, feed_dict, stream=None):

From b016f6af85bb3b862e0c81afd6a8f7767f8d720f Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 20:39:34 -0400
Subject: [PATCH 03/10] perf(trt): static spatial shapes + tactic cleanup for
 engine builder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolution is always known before inference and never changes, so all
three engine types (UNet, VAE encoder, VAE decoder, ControlNet) now
build with static spatial profiles (min=opt=max at exact resolution).

Static shapes unlock:
- tiling_optimization_level (FAST/MODERATE/FULL by GPU tier) — was
  skipped for all dynamic builds with 'symbolic shape, l2tc doesn't
  take effect' warning
- l2_limit_for_tiling — now applied for full L2 cache budget
- Geometry-specific kernel selection instead of range-covering kernels
- Tighter CUDA graph buffer allocation (exact dims vs worst-case 1024²)
- Faster builds: single-point tactic search vs 4× spatial range

Key fixes:
- get_minmax_dims(): static_shape flag was dead code — hardcoded to
  always return 256-1024 range regardless of the flag
- UNet.get_input_profile(): separation logic (opt != min padding) now
  guarded behind `if not static_shape` — was incorrectly padding opt
  away from min for static engines where min==opt==max is correct
- ControlNetTRT.get_input_profile(): had its own hardcoded 384-1024
  range that bypassed get_minmax_dims() entirely; now respects
  static_shape flag
- ControlNet residual scaling: max(min+1,...) guard now bypassed for
  static shapes where min==max; exact dims used directly
- Engine paths: add --res-{H}x{W} suffix for static builds to prevent
  cache collisions between different resolutions

Dead code removal:
- build_all_tactics / enable_all_tactics parameter excised from entire
  call chain (wrapper → builder → utilities → Engine.build/_build_fp8)
  TRT 10.12 defaults already enable EDGE_MASK_CONVOLUTIONS +
  JIT_CONVOLUTIONS; CUBLAS/CUBLAS_LT/CUDNN all deprecated and disabled

Tactic tuning:
- avg_timing_iterations=4 added to _apply_gpu_profile_to_config()
  Default 1 produces noisy single-sample measurements; 4 iterations
  give stable tactic rankings with negligible extra build time

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../acceleration/tensorrt/builder.py          |   9 +-
 .../acceleration/tensorrt/engine_manager.py   | 321 ++++---
 .../tensorrt/models/controlnet_models.py      | 303 ++++---
 .../acceleration/tensorrt/models/models.py    |  97 +-
 .../acceleration/tensorrt/utilities.py        |  18 +-
 src/streamdiffusion/wrapper.py                | 852 +++++++++---------
 6 files changed, 847 insertions(+), 753 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/builder.py b/src/streamdiffusion/acceleration/tensorrt/builder.py
index e12451aa..6b4934ff 100644
--- a/src/streamdiffusion/acceleration/tensorrt/builder.py
+++ b/src/streamdiffusion/acceleration/tensorrt/builder.py
@@ -66,7 +66,6 @@ def build(
         build_enable_refit: bool = False,
         build_static_batch: bool = False,
         build_dynamic_shape: bool = True,
-        build_all_tactics: bool = False,
         onnx_opset: int = 17,
         force_engine_build: bool = False,
         force_onnx_export: bool = False,
@@ -84,7 +83,6 @@ def build(
             "opt_resolution": f"{opt_image_width}x{opt_image_height}",
             "dynamic_range": f"{min_image_resolution}-{max_image_resolution}" if build_dynamic_shape else "static",
             "batch_size": opt_batch_size,
-            "build_all_tactics": build_all_tactics,
             "stages": {},
         }
 
@@ -206,7 +204,6 @@ def build(
                 opt_batch_size=opt_batch_size,
                 build_static_batch=build_static_batch,
                 build_dynamic_shape=build_dynamic_shape,
-                build_all_tactics=build_all_tactics,
                 build_enable_refit=build_enable_refit,
                 fp8=fp8_trt,
             )
@@ -226,10 +223,10 @@ def build(
         _build_logger.warning(f"[BUILD] {engine_filename} complete: {total_elapsed:.1f}s total")
         _write_build_stats(engine_path, stats)
 
-        # Cleanup ONNX artifacts — preserve .engine, .fp8.onnx, and build_stats.json
+        # Cleanup ONNX artifacts — preserve .engine, .fp8.onnx, timing.cache, and build_stats.json
         # Two-pass deletion to handle Windows file locks (gc.collect releases Python handles)
-        _keep_suffixes = (".engine", ".fp8.onnx")
-        _keep_exact = {"build_stats.json"}
+        _keep_suffixes = (".engine", ".fp8.onnx", ".cache")
+        _keep_exact = {"build_stats.json", "timing.cache"}
         engine_dir = os.path.dirname(engine_path)
         _to_delete = []
         for file in os.listdir(engine_dir):
diff --git a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
index 9b2e1df0..471f0de4 100644
--- a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
+++ b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
@@ -1,17 +1,18 @@
-
 import hashlib
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Any, Optional, Dict
+from typing import Any, Dict, Optional
+
 
 logger = logging.getLogger(__name__)
 
 
 class EngineType(Enum):
     """Engine types supported by the TensorRT engine manager."""
+
     UNET = "unet"
-    VAE_ENCODER = "vae_encoder" 
+    VAE_ENCODER = "vae_encoder"
     VAE_DECODER = "vae_decoder"
     CONTROLNET = "controlnet"
     SAFETY_CHECKER = "safety_checker"
@@ -20,62 +21,64 @@ class EngineType(Enum):
 class EngineManager:
     """
     Universal TensorRT engine manager using factory pattern.
-    
+
     Consolidates all engine management logic into a single class:
     - Path generation (moves create_prefix from wrapper.py)
-    - Compilation (moves compile_* calls from wrapper.py)  
+    - Compilation (moves compile_* calls from wrapper.py)
     - Loading (returns appropriate engine objects)
     """
-    
+
     def __init__(self, engine_dir: str):
         """Initialize with engine directory."""
         self.engine_dir = Path(engine_dir)
         self.engine_dir.mkdir(parents=True, exist_ok=True)
-        
+
         # Import the existing compile functions from tensorrt/__init__.py
         from streamdiffusion.acceleration.tensorrt import (
-            compile_unet, compile_vae_encoder, compile_vae_decoder, compile_safety_checker, compile_controlnet
+            compile_controlnet,
+            compile_safety_checker,
+            compile_unet,
+            compile_vae_decoder,
+            compile_vae_encoder,
         )
-        from streamdiffusion.acceleration.tensorrt.runtime_engines.unet_engine import (
-            UNet2DConditionModelEngine
-        )
-        from streamdiffusion.acceleration.tensorrt.runtime_engines.controlnet_engine import (
-            ControlNetModelEngine
-        )
-        
+        from streamdiffusion.acceleration.tensorrt.runtime_engines.controlnet_engine import ControlNetModelEngine
+        from streamdiffusion.acceleration.tensorrt.runtime_engines.unet_engine import UNet2DConditionModelEngine
+
         # TODO: add function to get use_cuda_graph from kwargs
         # Engine configurations - maps each type to its compile function and loader
         self._configs = {
             EngineType.UNET: {
-                'filename': 'unet.engine',
-                'compile_fn': compile_unet,
-                'loader': lambda path, cuda_stream, **kwargs: UNet2DConditionModelEngine(
+                "filename": "unet.engine",
+                "compile_fn": compile_unet,
+                "loader": lambda path, cuda_stream, **kwargs: UNet2DConditionModelEngine(
                     str(path), cuda_stream, use_cuda_graph=True
-                )
+                ),
             },
             EngineType.VAE_ENCODER: {
-                'filename': 'vae_encoder.engine', 
-                'compile_fn': compile_vae_encoder,
-                'loader': lambda path, cuda_stream, **kwargs: str(path)  # Return path for AutoencoderKLEngine
+                "filename": "vae_encoder.engine",
+                "compile_fn": compile_vae_encoder,
+                "loader": lambda path, cuda_stream, **kwargs: str(path),  # Return path for AutoencoderKLEngine
             },
             EngineType.VAE_DECODER: {
-                'filename': 'vae_decoder.engine',
-                'compile_fn': compile_vae_decoder, 
-                'loader': lambda path, cuda_stream, **kwargs: str(path)  # Return path for AutoencoderKLEngine
+                "filename": "vae_decoder.engine",
+                "compile_fn": compile_vae_decoder,
+                "loader": lambda path, cuda_stream, **kwargs: str(path),  # Return path for AutoencoderKLEngine
             },
             EngineType.CONTROLNET: {
-                'filename': 'cnet.engine',
-                'compile_fn': compile_controlnet,
-                'loader': lambda path, cuda_stream, **kwargs: ControlNetModelEngine(
-                    str(path), cuda_stream, use_cuda_graph=kwargs.get('use_cuda_graph', False),
-                    model_type=kwargs.get('model_type', 'sd15')
-                )
+                "filename": "cnet.engine",
+                "compile_fn": compile_controlnet,
+                "loader": lambda path, cuda_stream, **kwargs: ControlNetModelEngine(
+                    str(path),
+                    cuda_stream,
+                    use_cuda_graph=kwargs.get("use_cuda_graph", False),
+                    model_type=kwargs.get("model_type", "sd15"),
+                ),
             },
             EngineType.SAFETY_CHECKER: {
-                'filename': 'safety_checker.engine',
-                'compile_fn': compile_safety_checker,
-                'loader': lambda path, cuda_stream, **kwargs: str(path)
-            }
+                "filename": "safety_checker.engine",
+                "compile_fn": compile_safety_checker,
+                "loader": lambda path, cuda_stream, **kwargs: str(path),
+            },
         }
 
     def _lora_signature(self, lora_dict: Dict[str, float]) -> str:
@@ -93,54 +96,58 @@ def _lora_signature(self, lora_dict: Dict[str, float]) -> str:
         h = hashlib.sha1(canon.encode("utf-8")).hexdigest()[:10]
         return f"{len(lora_dict)}-{h}"
 
-    def get_engine_path(self,
-                       engine_type: EngineType,
-                       model_id_or_path: str,
-                       max_batch_size: int,
-                       min_batch_size: int,
-                       mode: str,
-                       use_tiny_vae: bool,
-                       lora_dict: Optional[Dict[str, float]] = None,
-                       ipadapter_scale: Optional[float] = None,
-                       ipadapter_tokens: Optional[int] = None,
-                       controlnet_model_id: Optional[str] = None,
-                       is_faceid: Optional[bool] = None,
-                       use_cached_attn: bool = False,
-                       use_controlnet: bool = False,
-                       fp8: bool = False
-                       ) -> Path:
+    def get_engine_path(
+        self,
+        engine_type: EngineType,
+        model_id_or_path: str,
+        max_batch_size: int,
+        min_batch_size: int,
+        mode: str,
+        use_tiny_vae: bool,
+        lora_dict: Optional[Dict[str, float]] = None,
+        ipadapter_scale: Optional[float] = None,
+        ipadapter_tokens: Optional[int] = None,
+        controlnet_model_id: Optional[str] = None,
+        is_faceid: Optional[bool] = None,
+        use_cached_attn: bool = False,
+        use_controlnet: bool = False,
+        fp8: bool = False,
+        resolution: Optional[tuple] = None,
+    ) -> Path:
         """
         Generate engine path using wrapper.py's current logic.
-        
+
         Moves and consolidates create_prefix() function from wrapper.py lines 995-1014.
         Special handling for ControlNet engines which use model_id-based directories.
         """
-        filename = self._configs[engine_type]['filename']
-        
+        filename = self._configs[engine_type]["filename"]
+
         if engine_type == EngineType.CONTROLNET:
             # ControlNet engines use special model_id-based directory structure
             if controlnet_model_id is None:
                 raise ValueError("get_engine_path: controlnet_model_id required for CONTROLNET engines")
-            
+
             # Convert model_id to directory name format (replace "/" with "_")
             model_dir_name = controlnet_model_id.replace("/", "_")
-            
-            # Use ControlNetEnginePool naming convention: dynamic engines with 384-1024 range
-            prefix = f"controlnet_{model_dir_name}--min_batch-{min_batch_size}--max_batch-{max_batch_size}--dyn-384-1024"
+
+            if resolution is not None:
+                prefix = f"controlnet_{model_dir_name}--min_batch-{min_batch_size}--max_batch-{max_batch_size}--res-{resolution[0]}x{resolution[1]}"
+            else:
+                prefix = f"controlnet_{model_dir_name}--min_batch-{min_batch_size}--max_batch-{max_batch_size}--dyn-384-1024"
             return self.engine_dir / prefix / filename
         else:
             # Standard engines use the unified prefix format
             # Extract base name (from wrapper.py lines 1002-1003)
             maybe_path = Path(model_id_or_path)
             base_name = maybe_path.stem if maybe_path.exists() else model_id_or_path
-            
+
             # Create prefix (from wrapper.py lines 1005-1013)
             prefix = f"{base_name}--tiny_vae-{use_tiny_vae}--min_batch-{min_batch_size}--max_batch-{max_batch_size}"
-            
+
             # IP-Adapter differentiation: add type and (optionally) tokens
             # Keep scale out of identity for runtime control, but include a type flag to separate caches
             if is_faceid is True:
-                prefix += f"--fid"
+                prefix += "--fid"
             if ipadapter_tokens is not None:
                 prefix += f"--tokens{ipadapter_tokens}"
 
@@ -156,9 +163,12 @@ def get_engine_path(self,
                     prefix += "--fp8"
 
             prefix += f"--mode-{mode}"
-            
+
+            if resolution is not None:
+                prefix += f"--res-{resolution[0]}x{resolution[1]}"
+
             return self.engine_dir / prefix / filename
-    
+
     def _get_embedding_dim_for_model_type(self, model_type: str) -> int:
         """Get embedding dimension based on model type."""
         if model_type.lower() in ["sdxl"]:
@@ -167,8 +177,10 @@ def _get_embedding_dim_for_model_type(self, model_type: str) -> int:
             return 1024
         else:  # sd15 and others
             return 768
-    
-    def _execute_compilation(self, compile_fn, engine_path: Path, model, model_config, batch_size: int, kwargs: Dict) -> None:
+
+    def _execute_compilation(
+        self, compile_fn, engine_path: Path, model, model_config, batch_size: int, kwargs: Dict
+    ) -> None:
         """Execute compilation with common pattern to eliminate duplication."""
         compile_fn(
             model,
@@ -177,140 +189,155 @@ def _execute_compilation(self, compile_fn, engine_path: Path, model, model_confi
             str(engine_path) + ".opt.onnx",
             str(engine_path),
             opt_batch_size=batch_size,
-            engine_build_options=kwargs.get('engine_build_options', {})
+            engine_build_options=kwargs.get("engine_build_options", {}),
         )
-    
+
     def _prepare_controlnet_models(self, kwargs: Dict):
         """Prepare ControlNet models for compilation."""
-        from streamdiffusion.acceleration.tensorrt.models.controlnet_models import create_controlnet_model
         import torch
-        
-        model_type = kwargs.get('model_type', 'sd15')
-        max_batch_size = kwargs['max_batch_size']
-        min_batch_size = kwargs['min_batch_size']
+
+        from streamdiffusion.acceleration.tensorrt.models.controlnet_models import create_controlnet_model
+
+        model_type = kwargs.get("model_type", "sd15")
+        max_batch_size = kwargs["max_batch_size"]
+        min_batch_size = kwargs["min_batch_size"]
         embedding_dim = self._get_embedding_dim_for_model_type(model_type)
-        
+
         # Create ControlNet model configuration
         controlnet_model = create_controlnet_model(
             model_type=model_type,
-            unet=kwargs.get('unet'),
-            model_path=kwargs.get('model_path', ""),
+            unet=kwargs.get("unet"),
+            model_path=kwargs.get("model_path", ""),
             max_batch_size=max_batch_size,
             min_batch_size=min_batch_size,
             embedding_dim=embedding_dim,
-            conditioning_channels=kwargs.get('conditioning_channels', 3)
+            conditioning_channels=kwargs.get("conditioning_channels", 3),
         )
-        
+
         # Prepare ControlNet model for compilation
-        pytorch_model = kwargs['model'].to(dtype=torch.float16)
-        
+        pytorch_model = kwargs["model"].to(dtype=torch.float16)
+
         return pytorch_model, controlnet_model
-    
-    def _get_default_controlnet_build_options(self) -> Dict:
+
+    def _get_default_controlnet_build_options(
+        self,
+        opt_image_height: int = 704,
+        opt_image_width: int = 704,
+        build_dynamic_shape: bool = False,
+    ) -> Dict:
         """Get default engine build options for ControlNet engines."""
-        return {
-            'opt_image_height': 704,  # Dynamic optimal resolution
-            'opt_image_width': 704,
-            'build_dynamic_shape': True,
-            'min_image_resolution': 384,
-            'max_image_resolution': 1024,
-            'build_static_batch': False,
-            'build_all_tactics': True,
+        opts = {
+            "opt_image_height": opt_image_height,
+            "opt_image_width": opt_image_width,
+            "build_dynamic_shape": build_dynamic_shape,
+            "build_static_batch": False,
         }
-    
-    def compile_and_load_engine(self, 
-                               engine_type: EngineType, 
-                               engine_path: Path,
-                               load_engine: bool = True,
-                               **kwargs) -> Any:
+        if build_dynamic_shape:
+            opts["min_image_resolution"] = 384
+            opts["max_image_resolution"] = 1024
+        return opts
+
+    def compile_and_load_engine(
+        self, engine_type: EngineType, engine_path: Path, load_engine: bool = True, **kwargs
+    ) -> Any:
         """
         Universal compile and load logic for all engine types.
-        
+
         Moves compilation blocks from wrapper.py lines 1200-1252, 1254-1283, 1285-1313.
         """
         if not engine_path.exists():
             # Get the appropriate compile function for this engine type
             config = self._configs[engine_type]
-            compile_fn = config['compile_fn']
-            
+            compile_fn = config["compile_fn"]
+
             # Ensure parent directory exists
             engine_path.parent.mkdir(parents=True, exist_ok=True)
-            
+
             # Handle engine-specific compilation requirements
             if engine_type == EngineType.VAE_DECODER:
                 # VAE decoder requires modifying forward method during compilation
-                stream_vae = kwargs['stream_vae']
+                stream_vae = kwargs["stream_vae"]
                 stream_vae.forward = stream_vae.decode
                 try:
-                    self._execute_compilation(compile_fn, engine_path, kwargs['model'], kwargs['model_config'], kwargs['batch_size'], kwargs)
+                    self._execute_compilation(
+                        compile_fn, engine_path, kwargs["model"], kwargs["model_config"], kwargs["batch_size"], kwargs
+                    )
                 finally:
                     # Always clean up the forward attribute
                     delattr(stream_vae, "forward")
             elif engine_type == EngineType.CONTROLNET:
                 # ControlNet requires special model creation and compilation
                 model, model_config = self._prepare_controlnet_models(kwargs)
-                self._execute_compilation(compile_fn, engine_path, model, model_config, kwargs['batch_size'], kwargs)
+                self._execute_compilation(compile_fn, engine_path, model, model_config, kwargs["batch_size"], kwargs)
             else:
                 # Standard compilation for UNet and VAE encoder
-                self._execute_compilation(compile_fn, engine_path, kwargs['model'], kwargs['model_config'], kwargs['batch_size'], kwargs)
+                self._execute_compilation(
+                    compile_fn, engine_path, kwargs["model"], kwargs["model_config"], kwargs["batch_size"], kwargs
+                )
         else:
-            logger.info(f"EngineManager: engine_path already exists, skipping compile")
-            
+            logger.info("EngineManager: engine_path already exists, skipping compile")
+
         if load_engine:
             return self.load_engine(engine_type, engine_path, **kwargs)
         else:
-            logger.info(f"EngineManager: load_engine is False, skipping load engine")
+            logger.info("EngineManager: load_engine is False, skipping load engine")
             return None
-    
+
     def load_engine(self, engine_type: EngineType, engine_path: Path, **kwargs: Dict) -> Any:
         """Load engine with type-specific handling."""
         config = self._configs[engine_type]
-        loader = config['loader']
-        
+        loader = config["loader"]
+
         if engine_type == EngineType.UNET:
             # UNet engine needs special handling for metadata and error recovery
-            loaded_engine = loader(engine_path, kwargs.get('cuda_stream'))
+            loaded_engine = loader(engine_path, kwargs.get("cuda_stream"))
             self._set_unet_metadata(loaded_engine, kwargs)
             return loaded_engine
         elif engine_type == EngineType.CONTROLNET:
             # ControlNet engine needs model_type parameter
-            return loader(engine_path, kwargs.get('cuda_stream'), 
-                         model_type=kwargs.get('model_type', 'sd15'),
-                         use_cuda_graph=kwargs.get('use_cuda_graph', False))
+            return loader(
+                engine_path,
+                kwargs.get("cuda_stream"),
+                model_type=kwargs.get("model_type", "sd15"),
+                use_cuda_graph=kwargs.get("use_cuda_graph", False),
+            )
         else:
-            return loader(engine_path, kwargs.get('cuda_stream'))
-    
+            return loader(engine_path, kwargs.get("cuda_stream"))
+
     def _set_unet_metadata(self, loaded_engine, kwargs: Dict) -> None:
         """Set metadata on UNet engine for runtime use."""
-        setattr(loaded_engine, 'use_control', kwargs.get('use_controlnet_trt', False))
-        setattr(loaded_engine, 'use_ipadapter', kwargs.get('use_ipadapter_trt', False))
-        
-        if kwargs.get('use_controlnet_trt', False):
-            setattr(loaded_engine, 'unet_arch', kwargs.get('unet_arch', {}))
-            
-        if kwargs.get('use_ipadapter_trt', False):
-            setattr(loaded_engine, 'ipadapter_arch', kwargs.get('unet_arch', {}))
+        setattr(loaded_engine, "use_control", kwargs.get("use_controlnet_trt", False))
+        setattr(loaded_engine, "use_ipadapter", kwargs.get("use_ipadapter_trt", False))
+
+        if kwargs.get("use_controlnet_trt", False):
+            setattr(loaded_engine, "unet_arch", kwargs.get("unet_arch", {}))
+
+        if kwargs.get("use_ipadapter_trt", False):
+            setattr(loaded_engine, "ipadapter_arch", kwargs.get("unet_arch", {}))
             # number of IP-attention layers for runtime vector sizing
-            if 'num_ip_layers' in kwargs and kwargs['num_ip_layers'] is not None:
-                setattr(loaded_engine, 'num_ip_layers', kwargs['num_ip_layers'])
-        
-    
-    def get_or_load_controlnet_engine(self, 
-                                    model_id: str,
-                                    pytorch_model: Any,
-                                    load_engine=True,
-                                    model_type: str = "sd15",
-                                    batch_size: int = 1,
-                                    min_batch_size: int = 1,
-                                    max_batch_size: int = 4,
-                                    cuda_stream = None,
-                                    use_cuda_graph: bool = False,
-                                    unet = None,
-                                    model_path: str = "",
-                                    conditioning_channels: int = 3) -> Any:
+            if "num_ip_layers" in kwargs and kwargs["num_ip_layers"] is not None:
+                setattr(loaded_engine, "num_ip_layers", kwargs["num_ip_layers"])
+
+    def get_or_load_controlnet_engine(
+        self,
+        model_id: str,
+        pytorch_model: Any,
+        load_engine=True,
+        model_type: str = "sd15",
+        batch_size: int = 1,
+        min_batch_size: int = 1,
+        max_batch_size: int = 4,
+        cuda_stream=None,
+        use_cuda_graph: bool = False,
+        unet=None,
+        model_path: str = "",
+        conditioning_channels: int = 3,
+        opt_image_height: int = 704,
+        opt_image_width: int = 704,
+    ) -> Any:
         """
         Get or load ControlNet engine, providing unified interface for ControlNet management.
-        
+
         Replaces ControlNetEnginePool.get_or_load_engine functionality.
         """
         # Generate engine path using ControlNet-specific logic
@@ -321,9 +348,10 @@ def get_or_load_controlnet_engine(self,
             min_batch_size=min_batch_size,
             mode="",  # Not used for ControlNet
             use_tiny_vae=False,  # Not used for ControlNet
-            controlnet_model_id=model_id
+            controlnet_model_id=model_id,
+            resolution=(opt_image_height, opt_image_width),
         )
-        
+
         # Compile and load ControlNet engine
         return self.compile_and_load_engine(
             EngineType.CONTROLNET,
@@ -339,5 +367,8 @@ def get_or_load_controlnet_engine(self,
             unet=unet,
             model_path=model_path,
             conditioning_channels=conditioning_channels,
-            engine_build_options=self._get_default_controlnet_build_options()
-        )
\ No newline at end of file
+            engine_build_options=self._get_default_controlnet_build_options(
+                opt_image_height=opt_image_height,
+                opt_image_width=opt_image_width,
+            ),
+        )
diff --git a/src/streamdiffusion/acceleration/tensorrt/models/controlnet_models.py b/src/streamdiffusion/acceleration/tensorrt/models/controlnet_models.py
index 9a9c6f83..cefed320 100644
--- a/src/streamdiffusion/acceleration/tensorrt/models/controlnet_models.py
+++ b/src/streamdiffusion/acceleration/tensorrt/models/controlnet_models.py
@@ -1,50 +1,49 @@
 """ControlNet TensorRT model definitions for compilation"""
 
-from typing import List, Dict, Optional
-from .models import BaseModel
-from ..export_wrappers.unet_sdxl_export import SDXLConditioningHandler, get_sdxl_tensorrt_config
-from ....model_detection import detect_model
+from typing import Dict, List
+
 import torch
 
+from ....model_detection import detect_model
+from ..export_wrappers.unet_sdxl_export import SDXLConditioningHandler
+from .models import BaseModel
+
 
 class ControlNetTRT(BaseModel):
     """TensorRT model definition for ControlNet compilation"""
-    
-    def __init__(self, 
-                 fp16: bool = True,
-                 device: str = "cuda",
-                 min_batch_size: int = 1,
-                 max_batch_size: int = 4,
-                 embedding_dim: int = 768,
-                 unet_dim: int = 4,
-                 conditioning_channels: int = 3,
-                 **kwargs):
+
+    def __init__(
+        self,
+        fp16: bool = True,
+        device: str = "cuda",
+        min_batch_size: int = 1,
+        max_batch_size: int = 4,
+        embedding_dim: int = 768,
+        unet_dim: int = 4,
+        conditioning_channels: int = 3,
+        **kwargs,
+    ):
         super().__init__(
             fp16=fp16,
             device=device,
             max_batch_size=max_batch_size,
             min_batch_size=min_batch_size,
             embedding_dim=embedding_dim,
-            **kwargs
+            **kwargs,
         )
         self.unet_dim = unet_dim
         self.conditioning_channels = conditioning_channels if conditioning_channels is not None else 3
         self.name = "ControlNet"
-        
+
     def get_input_names(self) -> List[str]:
         """Get input names for ControlNet TensorRT engine"""
-        return [
-            "sample",
-            "timestep",
-            "encoder_hidden_states",
-            "controlnet_cond"
-        ]
-        
+        return ["sample", "timestep", "encoder_hidden_states", "controlnet_cond"]
+
     def get_output_names(self) -> List[str]:
         """Get output names for ControlNet TensorRT engine"""
         down_names = [f"down_block_{i:02d}" for i in range(12)]
         return down_names + ["mid_block"]
-        
+
     def get_dynamic_axes(self) -> Dict[str, Dict[int, str]]:
         """Get dynamic axes configuration for variable input shapes"""
         return {
@@ -53,43 +52,40 @@ def get_dynamic_axes(self) -> Dict[str, Dict[int, str]]:
             "timestep": {0: "B"},
             "controlnet_cond": {0: "B", 2: "H_ctrl", 3: "W_ctrl"},
             **{f"down_block_{i:02d}": {0: "B", 2: "H", 3: "W"} for i in range(12)},
-            "mid_block": {0: "B", 2: "H", 3: "W"}
+            "mid_block": {0: "B", 2: "H", 3: "W"},
         }
-    
-    def get_input_profile(self, batch_size, image_height, image_width, 
-                         static_batch, static_shape):
-        """Generate TensorRT input profiles for ControlNet with dynamic 384-1024 range"""
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        """Generate TensorRT input profiles for ControlNet."""
         min_batch = batch_size if static_batch else self.min_batch
         max_batch = batch_size if static_batch else self.max_batch
-        
-        # Force dynamic shapes for universal engines (384-1024 range)
-        min_ctrl_h = 384  # Changed from 256 to 512 to match min resolution
-        max_ctrl_h = 1024
-        min_ctrl_w = 384  # Changed from 256 to 512 to match min resolution
-        max_ctrl_w = 1024
-        
-        # Use a flexible optimal resolution that's in the middle of the range
-        # This allows the engine to handle both smaller and larger resolutions
-        opt_ctrl_h = 704  # Middle of 512-1024 range
-        opt_ctrl_w = 704  # Middle of 512-1024 range
-        
-        # Calculate latent dimensions
-        min_latent_h = min_ctrl_h // 8  # 64
-        max_latent_h = max_ctrl_h // 8  # 128
-        min_latent_w = min_ctrl_w // 8  # 64
-        max_latent_w = max_ctrl_w // 8  # 128
-        opt_latent_h = opt_ctrl_h // 8  # 96
-        opt_latent_w = opt_ctrl_w // 8  # 96
-        
+
+        if static_shape:
+            # Static: min=opt=max at exact resolution — enables L2 tiling & geometry kernels
+            min_ctrl_h = max_ctrl_h = opt_ctrl_h = image_height
+            min_ctrl_w = max_ctrl_w = opt_ctrl_w = image_width
+        else:
+            min_ctrl_h = 384
+            max_ctrl_h = 1024
+            opt_ctrl_h = 704
+            min_ctrl_w = 384
+            max_ctrl_w = 1024
+            opt_ctrl_w = 704
+
+        min_latent_h = min_ctrl_h // 8
+        max_latent_h = max_ctrl_h // 8
+        min_latent_w = min_ctrl_w // 8
+        max_latent_w = max_ctrl_w // 8
+        opt_latent_h = opt_ctrl_h // 8
+        opt_latent_w = opt_ctrl_w // 8
+
         profile = {
             "sample": [
                 (min_batch, self.unet_dim, min_latent_h, min_latent_w),
                 (batch_size, self.unet_dim, opt_latent_h, opt_latent_w),
                 (max_batch, self.unet_dim, max_latent_h, max_latent_w),
             ],
-            "timestep": [
-                (min_batch,), (batch_size,), (max_batch,)
-            ],
+            "timestep": [(min_batch,), (batch_size,), (max_batch,)],
             "encoder_hidden_states": [
                 (min_batch, 77, self.embedding_dim),
                 (batch_size, 77, self.embedding_dim),
@@ -101,29 +97,28 @@ def get_input_profile(self, batch_size, image_height, image_width,
                 (max_batch, self.conditioning_channels, max_ctrl_h, max_ctrl_w),
             ],
         }
-        
+
         return profile
-    
+
     def get_sample_input(self, batch_size, image_height, image_width):
         """Generate sample inputs for ONNX export"""
         latent_height = image_height // 8
         latent_width = image_width // 8
         dtype = torch.float16 if self.fp16 else torch.float32
-        
+
         return (
-            torch.randn(batch_size, self.unet_dim, latent_height, latent_width, 
-                       dtype=dtype, device=self.device),
+            torch.randn(batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device),
             torch.ones(batch_size, dtype=torch.float32, device=self.device),
-            torch.randn(batch_size, 77, self.embedding_dim, 
-                       dtype=dtype, device=self.device),
-            torch.randn(batch_size, self.conditioning_channels, image_height, image_width, 
-                       dtype=dtype, device=self.device)
+            torch.randn(batch_size, 77, self.embedding_dim, dtype=dtype, device=self.device),
+            torch.randn(
+                batch_size, self.conditioning_channels, image_height, image_width, dtype=dtype, device=self.device
+            ),
         )
 
 
 class ControlNetSDXLTRT(ControlNetTRT):
     """SDXL-specific ControlNet TensorRT model definition"""
-    
+
     def __init__(self, unet=None, model_path="", **kwargs):
         # Use new model detection if UNet provided
         if unet is not None:
@@ -132,113 +127,133 @@ def __init__(self, unet=None, model_path="", **kwargs):
 
             # Create a config dict compatible with SDXLConditioningHandler
             config = {
-                'is_sdxl': detection_result['is_sdxl'],
-                'has_time_cond': detection_result['architecture_details']['has_time_conditioning'],
-                'has_addition_embed': detection_result['architecture_details']['has_addition_embeds'],
-                'model_type': detection_result['model_type'],
-                'is_turbo': detection_result['is_turbo'],
-                'is_sd3': detection_result['is_sd3'],
-                'confidence': detection_result['confidence'],
-                'architecture_details': detection_result['architecture_details'],
-                'compatibility_info': detection_result['compatibility_info']
+                "is_sdxl": detection_result["is_sdxl"],
+                "has_time_cond": detection_result["architecture_details"]["has_time_conditioning"],
+                "has_addition_embed": detection_result["architecture_details"]["has_addition_embeds"],
+                "model_type": detection_result["model_type"],
+                "is_turbo": detection_result["is_turbo"],
+                "is_sd3": detection_result["is_sd3"],
+                "confidence": detection_result["confidence"],
+                "architecture_details": detection_result["architecture_details"],
+                "compatibility_info": detection_result["compatibility_info"],
             }
             conditioning_handler = SDXLConditioningHandler(config)
             conditioning_spec = conditioning_handler.get_conditioning_spec()
-            
+
             # Set embedding_dim from sophisticated detection
-            kwargs.setdefault('embedding_dim', conditioning_spec['context_dim'])
-        
+            kwargs.setdefault("embedding_dim", conditioning_spec["context_dim"])
+
         # Set SDXL-specific defaults
-        kwargs.setdefault('embedding_dim', 2048)  # SDXL uses 2048-dim embeddings
-        kwargs.setdefault('unet_dim', 4)          # SDXL latent channels
-        
+        kwargs.setdefault("embedding_dim", 2048)  # SDXL uses 2048-dim embeddings
+        kwargs.setdefault("unet_dim", 4)  # SDXL latent channels
+
         super().__init__(**kwargs)
-        
+
         # SDXL ControlNet output specifications - 9 down blocks + 1 mid block
         # Following the pattern from UNet implementation:
         self.sdxl_output_channels = {
             # Initial sample
-            'down_block_00': 320,   # Initial: 320 channels
-            # Block 0 residuals  
-            'down_block_01': 320,   # Block0: 320 channels
-            'down_block_02': 320,   # Block0: 320 channels
-            'down_block_03': 320,   # Block0: 320 channels
+            "down_block_00": 320,  # Initial: 320 channels
+            # Block 0 residuals
+            "down_block_01": 320,  # Block0: 320 channels
+            "down_block_02": 320,  # Block0: 320 channels
+            "down_block_03": 320,  # Block0: 320 channels
             # Block 1 residuals
-            'down_block_04': 640,   # Block1: 640 channels
-            'down_block_05': 640,   # Block1: 640 channels
-            'down_block_06': 640,   # Block1: 640 channels
+            "down_block_04": 640,  # Block1: 640 channels
+            "down_block_05": 640,  # Block1: 640 channels
+            "down_block_06": 640,  # Block1: 640 channels
             # Block 2 residuals
-            'down_block_07': 1280,  # Block2: 1280 channels
-            'down_block_08': 1280,  # Block2: 1280 channels
+            "down_block_07": 1280,  # Block2: 1280 channels
+            "down_block_08": 1280,  # Block2: 1280 channels
             # Mid block
-            'mid_block': 1280       # Mid: 1280 channels
+            "mid_block": 1280,  # Mid: 1280 channels
         }
-    
+
     def get_shape_dict(self, batch_size, image_height, image_width):
         """Override to provide SDXL-specific output shapes for 9 down blocks"""
         # Get base input shapes
         base_shapes = super().get_shape_dict(batch_size, image_height, image_width)
-        
+
         # Add conditioning_scale to input shapes (scalar tensor)
         base_shapes["conditioning_scale"] = ()  # Scalar tensor has empty shape
-        
+
         # Calculate latent dimensions
         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
-        
+
         # SDXL output shapes matching UNet pattern:
         # Pattern: [88x88] + [88x88, 88x88, 44x44] + [44x44, 44x44, 22x22] + [22x22, 22x22]
         sdxl_output_shapes = {
             # Initial sample (no downsampling)
-            'down_block_00': (batch_size, 320, latent_height, latent_width),        # 88x88
+            "down_block_00": (batch_size, 320, latent_height, latent_width),  # 88x88
             # Block 0 residuals
-            'down_block_01': (batch_size, 320, latent_height, latent_width),        # 88x88  
-            'down_block_02': (batch_size, 320, latent_height, latent_width),        # 88x88
-            'down_block_03': (batch_size, 320, latent_height // 2, latent_width // 2),  # 44x44 (downsampled)
+            "down_block_01": (batch_size, 320, latent_height, latent_width),  # 88x88
+            "down_block_02": (batch_size, 320, latent_height, latent_width),  # 88x88
+            "down_block_03": (batch_size, 320, latent_height // 2, latent_width // 2),  # 44x44 (downsampled)
             # Block 1 residuals
-            'down_block_04': (batch_size, 640, latent_height // 2, latent_width // 2),  # 44x44
-            'down_block_05': (batch_size, 640, latent_height // 2, latent_width // 2),  # 44x44
-            'down_block_06': (batch_size, 640, latent_height // 4, latent_width // 4),  # 22x22 (downsampled)
-            # Block 2 residuals  
-            'down_block_07': (batch_size, 1280, latent_height // 4, latent_width // 4), # 22x22
-            'down_block_08': (batch_size, 1280, latent_height // 4, latent_width // 4), # 22x22
+            "down_block_04": (batch_size, 640, latent_height // 2, latent_width // 2),  # 44x44
+            "down_block_05": (batch_size, 640, latent_height // 2, latent_width // 2),  # 44x44
+            "down_block_06": (batch_size, 640, latent_height // 4, latent_width // 4),  # 22x22 (downsampled)
+            # Block 2 residuals
+            "down_block_07": (batch_size, 1280, latent_height // 4, latent_width // 4),  # 22x22
+            "down_block_08": (batch_size, 1280, latent_height // 4, latent_width // 4),  # 22x22
             # Mid block
-            'mid_block': (batch_size, 1280, latent_height // 4, latent_width // 4),     # 22x22
+            "mid_block": (batch_size, 1280, latent_height // 4, latent_width // 4),  # 22x22
         }
-        
+
         # Combine base inputs with SDXL outputs
         base_shapes.update(sdxl_output_shapes)
         return base_shapes
-    
+
     def get_sample_input(self, batch_size, image_height, image_width):
         """Override to provide SDXL-specific sample tensors with correct input format"""
         latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
         dtype = torch.float16 if self.fp16 else torch.float32
-        
+
         # SDXL ControlNet inputs (wrapper expects 7 inputs including SDXL conditioning)
         base_inputs = (
-            torch.randn(batch_size, self.unet_dim, latent_height, latent_width, 
-                       dtype=dtype, device=self.device),  # sample
+            torch.randn(
+                batch_size, self.unet_dim, latent_height, latent_width, dtype=dtype, device=self.device
+            ),  # sample
             torch.ones(batch_size, dtype=torch.float32, device=self.device),  # timestep
-            torch.randn(batch_size, self.text_maxlen, self.embedding_dim, 
-                       dtype=dtype, device=self.device),  # encoder_hidden_states
-            torch.randn(batch_size, self.conditioning_channels, image_height, image_width, 
-                       dtype=dtype, device=self.device),  # controlnet_cond
+            torch.randn(
+                batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device
+            ),  # encoder_hidden_states
+            torch.randn(
+                batch_size, self.conditioning_channels, image_height, image_width, dtype=dtype, device=self.device
+            ),  # controlnet_cond
             torch.tensor(1.0, dtype=torch.float32, device=self.device),  # conditioning_scale
             torch.randn(batch_size, 1280, dtype=dtype, device=self.device),  # text_embeds
-            torch.randn(batch_size, 6, dtype=dtype, device=self.device),     # time_ids
+            torch.randn(batch_size, 6, dtype=dtype, device=self.device),  # time_ids
         )
-        
+
         return base_inputs
-    
+
     def get_input_names(self):
         """Override to provide SDXL-specific input names"""
-        return ["sample", "timestep", "encoder_hidden_states", "controlnet_cond", "conditioning_scale", "text_embeds", "time_ids"]
-    
+        return [
+            "sample",
+            "timestep",
+            "encoder_hidden_states",
+            "controlnet_cond",
+            "conditioning_scale",
+            "text_embeds",
+            "time_ids",
+        ]
+
     def get_output_names(self):
         """Override to provide SDXL-specific output names that match wrapper return format"""
-        return ["down_block_00", "down_block_01", "down_block_02", "down_block_03", 
-                "down_block_04", "down_block_05", "down_block_06", "down_block_07", 
-                "down_block_08", "mid_block"]
+        return [
+            "down_block_00",
+            "down_block_01",
+            "down_block_02",
+            "down_block_03",
+            "down_block_04",
+            "down_block_05",
+            "down_block_06",
+            "down_block_07",
+            "down_block_08",
+            "mid_block",
+        ]
 
     def get_dynamic_axes(self) -> Dict[str, Dict[int, str]]:
         """Get dynamic axes configuration for variable input shapes"""
@@ -250,51 +265,49 @@ def get_dynamic_axes(self) -> Dict[str, Dict[int, str]]:
             "text_embeds": {0: "B"},
             "time_ids": {0: "B"},
             **{f"down_block_{i:02d}": {0: "B", 2: "H", 3: "W"} for i in range(9)},
-            "mid_block": {0: "B", 2: "H", 3: "W"}
+            "mid_block": {0: "B", 2: "H", 3: "W"},
         }
-    
-    def get_input_profile(self, batch_size, image_height, image_width, 
-                         static_batch, static_shape):
+
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
         """Override to provide SDXL-specific input profiles including text_embeds and time_ids"""
         # Get base profiles from parent class
-        profile = super().get_input_profile(batch_size, image_height, image_width, 
-                                           static_batch, static_shape)
-        
+        profile = super().get_input_profile(batch_size, image_height, image_width, static_batch, static_shape)
+
         # Add SDXL-specific input profiles with dynamic batch dimension
         min_batch = batch_size if static_batch else self.min_batch
         max_batch = batch_size if static_batch else self.max_batch
-        
+
         # conditioning_scale is a scalar (empty shape)
         profile["conditioning_scale"] = [
             (),  # min
             (),  # opt
             (),  # max
         ]
-        
+
         # text_embeds has shape (batch, 1280)
         profile["text_embeds"] = [
-            (min_batch, 1280),      # min
-            (batch_size, 1280),     # opt
-            (max_batch, 1280),      # max
+            (min_batch, 1280),  # min
+            (batch_size, 1280),  # opt
+            (max_batch, 1280),  # max
         ]
-        
+
         # time_ids has shape (batch, 6)
         profile["time_ids"] = [
-            (min_batch, 6),         # min
-            (batch_size, 6),        # opt
-            (max_batch, 6),         # max
+            (min_batch, 6),  # min
+            (batch_size, 6),  # opt
+            (max_batch, 6),  # max
         ]
-        
+
         return profile
 
 
-def create_controlnet_model(model_type: str = "sd15", 
-                           unet=None, model_path: str = "",
-                           conditioning_channels: int = 3,
-                           **kwargs) -> ControlNetTRT:
+def create_controlnet_model(
+    model_type: str = "sd15", unet=None, model_path: str = "", conditioning_channels: int = 3, **kwargs
+) -> ControlNetTRT:
     """Factory function to create appropriate ControlNet TensorRT model"""
     if model_type.lower() in ["sdxl"]:
-        return ControlNetSDXLTRT(unet=unet, model_path=model_path, 
-                                conditioning_channels=conditioning_channels, **kwargs)
+        return ControlNetSDXLTRT(
+            unet=unet, model_path=model_path, conditioning_channels=conditioning_channels, **kwargs
+        )
     else:
-        return ControlNetTRT(conditioning_channels=conditioning_channels, **kwargs) 
\ No newline at end of file
+        return ControlNetTRT(conditioning_channels=conditioning_channels, **kwargs)
diff --git a/src/streamdiffusion/acceleration/tensorrt/models/models.py b/src/streamdiffusion/acceleration/tensorrt/models/models.py
index 6de37236..62f7490d 100644
--- a/src/streamdiffusion/acceleration/tensorrt/models/models.py
+++ b/src/streamdiffusion/acceleration/tensorrt/models/models.py
@@ -150,12 +150,8 @@ def check_dims(self, batch_size, image_height, image_width):
         return (latent_height, latent_width)
 
     def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
-        # Following ComfyUI TensorRT approach: ensure proper min ≤ opt ≤ max constraints
-        # Even with static_batch=True, we need different min/max to avoid TensorRT constraint violations
-
         if static_batch:
-            # For static batch, still provide range to avoid min=opt=max constraint violation
-            min_batch = max(1, batch_size - 1)  # At least 1, but allow some range
+            min_batch = max(1, batch_size - 1)
             max_batch = batch_size
         else:
             min_batch = self.min_batch
@@ -164,16 +160,23 @@ def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, s
         latent_height = image_height // 8
         latent_width = image_width // 8
 
-        # Force dynamic shapes for height/width to enable runtime resolution changes
-        # Always use 384-1024 range regardless of static_shape flag
-        min_image_height = self.min_image_shape
-        max_image_height = self.max_image_shape
-        min_image_width = self.min_image_shape
-        max_image_width = self.max_image_shape
-        min_latent_height = self.min_latent_shape
-        max_latent_height = self.max_latent_shape
-        min_latent_width = self.min_latent_shape
-        max_latent_width = self.max_latent_shape
+        if static_shape:
+            # Static: min=opt=max — TRT selects geometry-specific kernels,
+            # enables L2 tiling, and CUDA graphs avoid worst-case allocation.
+            min_image_height = max_image_height = image_height
+            min_image_width = max_image_width = image_width
+            min_latent_height = max_latent_height = latent_height
+            min_latent_width = max_latent_width = latent_width
+        else:
+            # Dynamic: full range for runtime resolution flexibility
+            min_image_height = self.min_image_shape
+            max_image_height = self.max_image_shape
+            min_image_width = self.min_image_shape
+            max_image_width = self.max_image_shape
+            min_latent_height = self.min_latent_shape
+            max_latent_height = self.max_latent_shape
+            min_latent_width = self.min_latent_shape
+            max_latent_width = self.max_latent_shape
 
         return (
             min_batch,
@@ -586,23 +589,29 @@ def get_input_profile(self, batch_size, image_height, image_width, static_batch,
         opt_latent_height = min(max(latent_height, min_latent_height), max_latent_height)
         opt_latent_width = min(max(latent_width, min_latent_width), max_latent_width)
 
-        # Ensure no dimension equality that causes constraint violations
-        if opt_latent_height == min_latent_height and min_latent_height < max_latent_height:
-            opt_latent_height = min(min_latent_height + 8, max_latent_height)  # Add 8 pixels for separation
-        if opt_latent_width == min_latent_width and min_latent_width < max_latent_width:
-            opt_latent_width = min(min_latent_width + 8, max_latent_width)
+        # For dynamic shapes, ensure opt != min to satisfy TRT constraint (min < opt <= max).
+        # For static shapes min == opt == max is correct and intentional — skip separation.
+        if not static_shape:
+            if opt_latent_height == min_latent_height and min_latent_height < max_latent_height:
+                opt_latent_height = min(min_latent_height + 8, max_latent_height)
+            if opt_latent_width == min_latent_width and min_latent_width < max_latent_width:
+                opt_latent_width = min(min_latent_width + 8, max_latent_width)
 
         # Image dimensions for ControlNet inputs
-        min_image_h, max_image_h = self.min_image_shape, self.max_image_shape
-        min_image_w, max_image_w = self.min_image_shape, self.max_image_shape
-        opt_image_height = min(max(image_height, min_image_h), max_image_h)
-        opt_image_width = min(max(image_width, min_image_w), max_image_w)
-
-        # Ensure image dimension separation as well
-        if opt_image_height == min_image_h and min_image_h < max_image_h:
-            opt_image_height = min(min_image_h + 64, max_image_h)  # Add 64 pixels for separation
-        if opt_image_width == min_image_w and min_image_w < max_image_w:
-            opt_image_width = min(min_image_w + 64, max_image_w)
+        if static_shape:
+            min_image_h = max_image_h = image_height
+            min_image_w = max_image_w = image_width
+            opt_image_height = image_height
+            opt_image_width = image_width
+        else:
+            min_image_h, max_image_h = self.min_image_shape, self.max_image_shape
+            min_image_w, max_image_w = self.min_image_shape, self.max_image_shape
+            opt_image_height = min(max(image_height, min_image_h), max_image_h)
+            opt_image_width = min(max(image_width, min_image_w), max_image_w)
+            if opt_image_height == min_image_h and min_image_h < max_image_h:
+                opt_image_height = min(min_image_h + 64, max_image_h)
+            if opt_image_width == min_image_w and min_image_w < max_image_w:
+                opt_image_width = min(min_image_w + 64, max_image_w)
 
         profile = {
             "sample": [
@@ -641,18 +650,22 @@ def get_input_profile(self, batch_size, image_height, image_width, static_batch,
                 control_height = shape_spec["height"]
                 control_width = shape_spec["width"]
 
-                # Create optimization profile with proper spatial dimension scaling
-                # Scale the spatial dimensions proportionally with the main latent dimensions
-                scale_h = opt_latent_height / latent_height if latent_height > 0 else 1.0
-                scale_w = opt_latent_width / latent_width if latent_width > 0 else 1.0
-
-                min_control_h = max(1, int(control_height * min_latent_height / latent_height))
-                max_control_h = max(min_control_h + 1, int(control_height * max_latent_height / latent_height))
-                opt_control_h = max(min_control_h, min(int(control_height * scale_h), max_control_h))
-
-                min_control_w = max(1, int(control_width * min_latent_width / latent_width))
-                max_control_w = max(min_control_w + 1, int(control_width * max_latent_width / latent_width))
-                opt_control_w = max(min_control_w, min(int(control_width * scale_w), max_control_w))
+                if static_shape:
+                    # Static: all three identical — exact resolution, no padding
+                    min_control_h = max_control_h = opt_control_h = control_height
+                    min_control_w = max_control_w = opt_control_w = control_width
+                else:
+                    # Dynamic: scale proportionally with latent range
+                    scale_h = opt_latent_height / latent_height if latent_height > 0 else 1.0
+                    scale_w = opt_latent_width / latent_width if latent_width > 0 else 1.0
+
+                    min_control_h = max(1, int(control_height * min_latent_height / latent_height))
+                    max_control_h = max(min_control_h + 1, int(control_height * max_latent_height / latent_height))
+                    opt_control_h = max(min_control_h, min(int(control_height * scale_h), max_control_h))
+
+                    min_control_w = max(1, int(control_width * min_latent_width / latent_width))
+                    max_control_w = max(min_control_w + 1, int(control_width * max_latent_width / latent_width))
+                    opt_control_w = max(min_control_w, min(int(control_width * scale_w), max_control_w))
 
                 profile[name] = [
                     (min_batch, channels, min_control_h, min_control_w),  # min
diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index d20f1d16..5c8f3663 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -291,6 +291,17 @@ def _apply_gpu_profile_to_config(
         except Exception:
             logger.debug("[TRT Config] RUNTIME_ACTIVATION_RESIZE_10_10 not supported — skipping")
 
+    # avg_timing_iterations: number of timing runs averaged per tactic candidate.
+    # Default 1 produces noisy measurements — occasional slow GPU clocks or cache
+    # miss can unfairly disqualify the best kernel. Value of 4 gives stable rankings
+    # with minimal extra build time (4× timing overhead, which is tiny vs. compilation).
+    # TRT 10.12 confirmed to support this property.
+    try:
+        config.avg_timing_iterations = 4
+        logger.info("[TRT Config] avg_timing_iterations=4")
+    except AttributeError:
+        logger.debug("[TRT Config] avg_timing_iterations not supported — skipping")
+
 
 # Map of numpy dtype -> torch dtype
 numpy_to_torch_dtype_dict = {
@@ -473,7 +484,6 @@ def build(
         fp16,
         input_profile=None,
         enable_refit=False,
-        enable_all_tactics=False,
         timing_cache=None,
         workspace_size=0,
         fp8=False,
@@ -484,7 +494,7 @@ def build(
 
         if fp8:
             self._build_fp8(
-                onnx_path, input_profile, workspace_size, enable_all_tactics,
+                onnx_path, input_profile, workspace_size,
                 timing_cache=timing_cache, gpu_profile=gpu_profile,
                 dynamic_shapes=dynamic_shapes,
             )
@@ -582,7 +592,6 @@ def _build_fp8(
         onnx_path,
         input_profile,
         workspace_size,
-        enable_all_tactics,
         timing_cache=None,
         gpu_profile: Optional["GPUBuildProfile"] = None,
         dynamic_shapes: bool = True,
@@ -598,7 +607,6 @@ def _build_fp8(
             onnx_path: Path to *.fp8.onnx (Q/DQ-annotated by fp8_quantize.py).
             input_profile: Dict of {name: (min, opt, max)} shapes.
             workspace_size: TRT workspace limit in bytes.
-            enable_all_tactics: If True, allow all TRT tactic sources.
             timing_cache: Path to timing cache file for load/save.
             gpu_profile: Hardware-aware build parameters from detect_gpu_profile().
             dynamic_shapes: Whether the engine uses dynamic input shapes.
@@ -946,7 +954,6 @@ def build_engine(
     opt_batch_size: int,
     build_static_batch: bool = False,
     build_dynamic_shape: bool = False,
-    build_all_tactics: bool = False,
     build_enable_refit: bool = False,
     fp8: bool = False,
 ):
@@ -988,7 +995,6 @@ def build_engine(
         fp16=True,
         input_profile=input_profile,
         enable_refit=build_enable_refit,
-        enable_all_tactics=build_all_tactics,
         timing_cache=timing_cache_path,
         workspace_size=max_workspace_size,
         fp8=fp8,
diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index 0cadedd4..c215ad82 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -1,17 +1,18 @@
+import logging
 import os
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Union, Any, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
-import torch
 import numpy as np
+import torch
+from diffusers import AutoencoderTiny, AutoPipelineForText2Image, StableDiffusionPipeline, StableDiffusionXLPipeline
 from PIL import Image
-from diffusers import AutoencoderTiny, StableDiffusionPipeline, StableDiffusionXLPipeline, AutoPipelineForText2Image
 
-from .pipeline import StreamDiffusion
-from .model_detection import detect_model
 from .image_utils import postprocess_image
+from .model_detection import detect_model
+from .pipeline import StreamDiffusion
+
 
-import logging
 logger = logging.getLogger(__name__)
 
 torch.set_grad_enabled(False)
@@ -67,6 +68,7 @@ class StreamDiffusionWrapper:
     - Use get_cache_info() to inspect cache statistics
     - Use clear_caches() to free memory
     """
+
     def __init__(
         self,
         model_id_or_path: str,
@@ -245,7 +247,7 @@ def __init__(
         """
         if compile_engines_only:
             logger.info("compile_engines_only is True, will only compile engines and not load the model")
-        
+
         # Store use_lcm_lora for backwards compatibility processing in _load_model
         self.use_lcm_lora = use_lcm_lora
 
@@ -253,7 +255,7 @@ def __init__(
         self.use_controlnet = use_controlnet
         self.use_ipadapter = use_ipadapter
         self.ipadapter_config = ipadapter_config
-        
+
         # Store pipeline hook configurations
         self.image_preprocessing_config = image_preprocessing_config
         self.image_postprocessing_config = image_postprocessing_config
@@ -262,20 +264,14 @@ def __init__(
 
         if mode == "txt2img":
             if cfg_type != "none":
-                raise ValueError(
-                    f"txt2img mode accepts only cfg_type = 'none', but got {cfg_type}"
-                )
+                raise ValueError(f"txt2img mode accepts only cfg_type = 'none', but got {cfg_type}")
             if use_denoising_batch and frame_buffer_size > 1:
                 if not self.sd_turbo:
-                    raise ValueError(
-                        "txt2img mode cannot use denoising batch with frame_buffer_size > 1."
-                    )
+                    raise ValueError("txt2img mode cannot use denoising batch with frame_buffer_size > 1.")
 
         if mode == "img2img":
             if not use_denoising_batch:
-                raise NotImplementedError(
-                    "img2img mode must use denoising batch for now."
-                )
+                raise NotImplementedError("img2img mode must use denoising batch for now.")
 
         self.device = device
         self.dtype = dtype
@@ -284,11 +280,7 @@ def __init__(
         self.mode = mode
         self.output_type = output_type
         self.frame_buffer_size = frame_buffer_size
-        self.batch_size = (
-            len(t_index_list) * frame_buffer_size
-            if use_denoising_batch
-            else frame_buffer_size
-        )
+        self.batch_size = len(t_index_list) * frame_buffer_size if use_denoising_batch else frame_buffer_size
         self.min_batch_size = min_batch_size
         self.max_batch_size = max_batch_size
 
@@ -307,7 +299,7 @@ def __init__(
             t_index_list=t_index_list,
             acceleration=acceleration,
             do_add_noise=do_add_noise,
-            use_lcm_lora=use_lcm_lora, # Deprecated:Backwards compatibility
+            use_lcm_lora=use_lcm_lora,  # Deprecated:Backwards compatibility
             use_tiny_vae=use_tiny_vae,
             cfg_type=cfg_type,
             engine_dir=engine_dir,
@@ -347,13 +339,15 @@ def __init__(
             "",
             "",
             num_inference_steps=50,
-            guidance_scale=1.1
-            if self.stream.cfg_type in ["full", "self", "initialize"]
-            else 1.0,
+            guidance_scale=1.1 if self.stream.cfg_type in ["full", "self", "initialize"] else 1.0,
             generator=torch.manual_seed(seed),
             seed=seed,
         )
 
+        # Offload text encoders to CPU after initial encoding to free ~1.6 GB VRAM (SDXL).
+        # They are reloaded on-demand before each prompt re-encoding call.
+        if acceleration == "tensorrt":
+            self._offload_text_encoders()
 
         # Set wrapper reference on parameter updater so it can access pipeline structure
         self.stream._param_updater.wrapper = self
@@ -362,14 +356,8 @@ def __init__(
         self._acceleration = acceleration
         self._engine_dir = engine_dir
 
-        # Prompt change tracking: skip text encoder reload when text is identical
-        self._last_prompt_texts: Optional[List[str]] = None
-        self._last_negative_prompt: Optional[str] = None
-
         if device_ids is not None:
-            self.stream.unet = torch.nn.DataParallel(
-                self.stream.unet, device_ids=device_ids
-            )
+            self.stream.unet = torch.nn.DataParallel(self.stream.unet, device_ids=device_ids)
 
         if enable_similar_image_filter:
             self.stream.enable_similar_image_filter(
@@ -418,7 +406,6 @@ def prepare(
             Method for interpolating between seed noise tensors, by default "linear".
         """
 
-
         # Handle both single prompt and prompt blending
         if isinstance(prompt, str):
             # Single prompt mode (legacy interface)
@@ -474,23 +461,10 @@ def prepare(
             raise TypeError(f"prepare: prompt must be str or List[Tuple[str, float]], got {type(prompt)}")
 
     def _offload_text_encoders(self) -> None:
-        """No-op during inference: text encoders kept on GPU.
+        """Move text encoders to CPU to free VRAM (~1.6 GB for SDXL).
 
-        Prompts change during inference while UNet/VAE/ControlNet are constant.
-        The ~1.6GB text encoder VRAM fits comfortably alongside other components
-        on a 24GB GPU, so the offload-reload cycle is pure overhead.
-        For maximum-VRAM scenarios (engine building, FP8 quantization),
-        use _force_offload_text_encoders() explicitly instead.
-        """
-
-    def _reload_text_encoders(self) -> None:
-        """No-op: text encoders remain on GPU (never offloaded during inference)."""
-
-    def _force_offload_text_encoders(self) -> None:
-        """Force-offload text encoders to CPU. Use during engine building or FP8 quantization only.
-
-        Frees ~1.6GB VRAM for maximum headroom during one-time build processes.
-        Call _force_reload_text_encoders() afterwards to restore state.
+        Called automatically after initial prepare() when using TRT acceleration.
+        Text encoders are reloaded to GPU before each prompt re-encoding call.
         """
         pipe = self.stream.pipe
         if hasattr(pipe, "text_encoder") and pipe.text_encoder is not None:
@@ -500,16 +474,16 @@ def _force_offload_text_encoders(self) -> None:
             if next(pipe.text_encoder_2.parameters(), None) is not None:
                 pipe.text_encoder_2 = pipe.text_encoder_2.to("cpu")
         torch.cuda.empty_cache()
-        logger.debug("[VRAM] Text encoders force-offloaded to CPU (engine build/quantization)")
+        logger.debug("[VRAM] Text encoders offloaded to CPU")
 
-    def _force_reload_text_encoders(self) -> None:
-        """Force-reload text encoders to GPU after engine building or FP8 quantization."""
+    def _reload_text_encoders(self) -> None:
+        """Move text encoders back to GPU before prompt re-encoding."""
         pipe = self.stream.pipe
         if hasattr(pipe, "text_encoder") and pipe.text_encoder is not None:
             pipe.text_encoder = pipe.text_encoder.to(self.device)
         if hasattr(pipe, "text_encoder_2") and pipe.text_encoder_2 is not None:
             pipe.text_encoder_2 = pipe.text_encoder_2.to(self.device)
-        logger.debug("[VRAM] Text encoders force-reloaded to GPU")
+        logger.debug("[VRAM] Text encoders reloaded to GPU")
 
     def update_prompt(
         self,
@@ -517,7 +491,7 @@ def update_prompt(
         negative_prompt: str = "",
         prompt_interpolation_method: Literal["linear", "slerp"] = "slerp",
         clear_blending: bool = True,
-        warn_about_conflicts: bool = True
+        warn_about_conflicts: bool = True,
     ) -> None:
         """
         Update to a new prompt or prompt blending configuration.
@@ -648,8 +622,8 @@ def update_stream_params(
             When False, weights > 1 will amplify noise.
         controlnet_config : Optional[List[Dict[str, Any]]]
             Complete ControlNet configuration list defining the desired state.
-            Each dict contains: model_id, preprocessor, conditioning_scale, enabled, 
-            preprocessor_params, etc. System will diff current vs desired state and 
+            Each dict contains: model_id, preprocessor, conditioning_scale, enabled,
+            preprocessor_params, etc. System will diff current vs desired state and
             perform minimal add/remove/update operations.
         ipadapter_config : Optional[Dict[str, Any]]
             IPAdapter configuration dict containing scale, style_image, etc.
@@ -658,26 +632,9 @@ def update_stream_params(
         safety_checker_threshold : Optional[float]
             The threshold for the safety checker.
         """
-        # Reload text encoders to GPU only when prompt text actually changed.
-        # OSC sends prompt updates at ~60 Hz even with identical text, so comparing
-        # against the last encoded texts avoids repeated ~1.6 GB CPU↔GPU transfers.
-        _new_prompt_texts = (
-            [p for p, _w in prompt_list] if prompt_list is not None else None
-        )
-        _texts_changed = (
-            _new_prompt_texts is not None
-            and _new_prompt_texts != self._last_prompt_texts
-        )
-        _neg_changed = (
-            negative_prompt is not None
-            and negative_prompt != self._last_negative_prompt
-        )
-        needs_encoding = _texts_changed or _neg_changed
+        # Reload text encoders to GPU if a new prompt needs encoding.
+        needs_encoding = prompt_list is not None or negative_prompt is not None
         if needs_encoding:
-            if _new_prompt_texts is not None:
-                self._last_prompt_texts = _new_prompt_texts
-            if negative_prompt is not None:
-                self._last_negative_prompt = negative_prompt
             self._reload_text_encoders()
         try:
             # Handle all parameters via parameter updater (including ControlNet)
@@ -733,45 +690,44 @@ def __call__(
         """
         if self.skip_diffusion:
             return self._process_skip_diffusion(image, prompt)
-        
+
         if self.mode == "img2img":
             return self.img2img(image, prompt)
         else:
             return self.txt2img(prompt)
 
     def _process_skip_diffusion(
-        self, 
-        image: Optional[Union[str, Image.Image, torch.Tensor]] = None, 
-        prompt: Optional[str] = None
+        self, image: Optional[Union[str, Image.Image, torch.Tensor]] = None, prompt: Optional[str] = None
     ) -> Union[Image.Image, List[Image.Image], torch.Tensor, np.ndarray]:
         """
         Process input directly without diffusion, applying pre/post processing hooks.
-        
+
         This method bypasses VAE encoding, diffusion, and VAE decoding, but still
         applies image preprocessing and postprocessing hooks for consistent processing.
-        
+
         Parameters
         ----------
         image : Optional[Union[str, Image.Image, torch.Tensor]]
             The image to process directly.
         prompt : Optional[str]
             Prompt (ignored in skip mode, but kept for API consistency).
-            
+
         Returns
         -------
         Union[Image.Image, List[Image.Image], torch.Tensor, np.ndarray]
             The processed image with hooks applied.
         """
 
-        #TODO: add safety checker call somewhere in this method
-
+        # TODO: add safety checker call somewhere in this method
 
         if self.mode == "txt2img":
-            raise RuntimeError("_process_skip_diffusion: skip_diffusion mode not applicable for txt2img - no input image")
-        
+            raise RuntimeError(
+                "_process_skip_diffusion: skip_diffusion mode not applicable for txt2img - no input image"
+            )
+
         if image is None:
             raise ValueError("_process_skip_diffusion: image required for skip diffusion mode")
-        
+
         # Handle input tensor normalization to [-1,1] pipeline range
         if isinstance(image, str) or isinstance(image, Image.Image):
             processed_tensor = self.preprocess_image(image)
@@ -783,19 +739,17 @@ def _process_skip_diffusion(
             preprocessor_input = image
 
         preprocessor_output = self.stream._apply_image_preprocessing_hooks(preprocessor_input)
-        
+
         # Convert [0,1] -> [-1,1] back to pipeline range for postprocessing hooks
         processed_tensor = self._normalize_on_gpu(preprocessor_output)
-        
+
         # Apply image postprocessing hooks (expect [-1,1] range - post-VAE decoding)
         processed_tensor = self.stream._apply_image_postprocessing_hooks(processed_tensor)
-        
+
         # Final postprocessing for output format
         return self.postprocess_image(processed_tensor, output_type=self.output_type)
 
-    def txt2img(
-        self, prompt: Optional[str] = None
-    ) -> Union[Image.Image, List[Image.Image], torch.Tensor, np.ndarray]:
+    def txt2img(self, prompt: Optional[str] = None) -> Union[Image.Image, List[Image.Image], torch.Tensor, np.ndarray]:
         """
         Performs txt2img.
 
@@ -812,12 +766,12 @@ def txt2img(
         """
         if prompt is not None:
             self.update_prompt(prompt, warn_about_conflicts=True)
-        
+
         if self.sd_turbo:
             image_tensor = self.stream.txt2img_sd_turbo(self.batch_size)
         else:
             image_tensor = self.stream.txt2img(self.frame_buffer_size)
-        
+
         image = self.postprocess_image(image_tensor, output_type=self.output_type)
 
         if self.use_safety_checker:
@@ -890,15 +844,15 @@ def preprocess_image(self, image: Union[str, Image.Image, torch.Tensor]) -> torc
         # Use stream's current resolution instead of wrapper's cached values
         current_width = self.stream.width
         current_height = self.stream.height
-        
+
         if isinstance(image, str):
             image = Image.open(image).convert("RGB").resize((current_width, current_height))
         if isinstance(image, Image.Image):
             image = image.convert("RGB").resize((current_width, current_height))
 
-        return self.stream.image_processor.preprocess(
-            image, current_height, current_width
-        ).to(device=self.device, dtype=self.dtype)
+        return self.stream.image_processor.preprocess(image, current_height, current_width).to(
+            device=self.device, dtype=self.dtype
+        )
 
     def postprocess_image(
         self, image_tensor: torch.Tensor, output_type: str = "pil"
@@ -927,7 +881,6 @@ def postprocess_image(
             denormalized = self._denormalize_on_gpu(image_tensor)
             return denormalized.cpu().permute(0, 2, 3, 1).float().numpy()
 
-
         # PIL output path (optimized)
         if output_type == "pil":
             if self.frame_buffer_size > 1:
@@ -935,7 +888,6 @@ def postprocess_image(
             else:
                 return self._tensor_to_pil_optimized(image_tensor)[0]
 
-
         # Fallback to original method for any unexpected output types
         if self.frame_buffer_size > 1:
             return postprocess_image(image_tensor.cpu(), output_type=output_type)
@@ -999,28 +951,23 @@ def _tensor_to_pil_optimized(self, image_tensor: torch.Tensor) -> List[Image.Ima
         # Denormalize on GPU first
         denormalized = self._denormalize_on_gpu(image_tensor)
 
-
         # Convert to uint8 on GPU to reduce transfer size
         # Scale to [0, 255] and convert to uint8
         # Scale to [0, 255] and convert to uint8
         uint8_tensor = (denormalized * 255).clamp(0, 255).to(torch.uint8)
 
-
         # Single efficient CPU transfer
         cpu_tensor = uint8_tensor.cpu()
 
-
         # Convert to HWC format for PIL
         # From BCHW to BHWC
         cpu_tensor = cpu_tensor.permute(0, 2, 3, 1)
 
-
         # Convert to PIL images efficiently
         pil_images = []
         for i in range(cpu_tensor.shape[0]):
             img_array = cpu_tensor[i].numpy()
 
-
             if img_array.shape[-1] == 1:
                 # Grayscale
                 pil_images.append(Image.fromarray(img_array.squeeze(-1), mode="L"))
@@ -1028,7 +975,6 @@ def _tensor_to_pil_optimized(self, image_tensor: torch.Tensor) -> List[Image.Ima
                 # RGB
                 pil_images.append(Image.fromarray(img_array))
 
-
         return pil_images
 
     def set_nsfw_fallback_img(self, height: int, width: int) -> None:
@@ -1184,7 +1130,7 @@ def _load_model(
             self.cleanup_gpu_memory()
         except Exception as e:
             logger.warning(f"GPU cleanup warning: {e}")
-        
+
         # Reset CUDA context to prevent corruption from previous runs
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
@@ -1197,14 +1143,14 @@ def _load_model(
         # TODO: CAN we do this step with model_detection.py?
         is_sdxl_model = False
         model_path_lower = model_id_or_path.lower()
-        
+
         # Check path for SDXL indicators
-        if any(indicator in model_path_lower for indicator in ['sdxl', 'xl', '1024']):
+        if any(indicator in model_path_lower for indicator in ["sdxl", "xl", "1024"]):
             is_sdxl_model = True
             logger.info(f"_load_model: Path suggests SDXL model: {model_id_or_path}")
-        
+
         # For .safetensor files, we need to be more careful about pipeline selection
-        if model_id_or_path.endswith('.safetensors'):
+        if model_id_or_path.endswith(".safetensors"):
             # For .safetensor files, try SDXL pipeline first if path suggests SDXL
             if is_sdxl_model:
                 loading_methods = [
@@ -1216,14 +1162,14 @@ def _load_model(
                 loading_methods = [
                     (AutoPipelineForText2Image.from_pretrained, "AutoPipeline from_pretrained"),
                     (StableDiffusionPipeline.from_single_file, "SD from_single_file"),
-                    (StableDiffusionXLPipeline.from_single_file, "SDXL from_single_file")
+                    (StableDiffusionXLPipeline.from_single_file, "SDXL from_single_file"),
                 ]
         else:
             # For regular model directories or checkpoints, use the original order
             loading_methods = [
                 (AutoPipelineForText2Image.from_pretrained, "AutoPipeline from_pretrained"),
                 (StableDiffusionPipeline.from_single_file, "SD from_single_file"),
-                (StableDiffusionXLPipeline.from_single_file, "SDXL from_single_file")
+                (StableDiffusionXLPipeline.from_single_file, "SDXL from_single_file"),
             ]
 
         pipe = None
@@ -1233,19 +1179,19 @@ def _load_model(
                 logger.info(f"_load_model: Attempting to load with {method_name}...")
                 pipe = method(model_id_or_path).to(dtype=self.dtype)
                 logger.info(f"_load_model: Successfully loaded using {method_name}")
-                
+
                 # Verify that we have the right pipeline type for SDXL models
                 if is_sdxl_model and not isinstance(pipe, StableDiffusionXLPipeline):
                     logger.warning(f"_load_model: SDXL model detected but loaded with non-SDXL pipeline: {type(pipe)}")
                     # Try to explicitly load with SDXL pipeline instead
                     try:
-                        logger.info(f"_load_model: Retrying with StableDiffusionXLPipeline...")
+                        logger.info("_load_model: Retrying with StableDiffusionXLPipeline...")
                         pipe = StableDiffusionXLPipeline.from_single_file(model_id_or_path).to(dtype=self.dtype)
-                        logger.info(f"_load_model: Successfully loaded using SDXL pipeline on retry")
+                        logger.info("_load_model: Successfully loaded using SDXL pipeline on retry")
                     except Exception as retry_error:
                         logger.warning(f"_load_model: SDXL pipeline retry failed: {retry_error}")
                         # Continue with the originally loaded pipeline
-                
+
                 break
             except Exception as e:
                 logger.warning(f"_load_model: {method_name} failed: {e}")
@@ -1253,11 +1199,14 @@ def _load_model(
                 continue
 
         if pipe is None:
-            error_msg = f"_load_model: All loading methods failed for model '{model_id_or_path}'. Last error: {last_error}"
+            error_msg = (
+                f"_load_model: All loading methods failed for model '{model_id_or_path}'. Last error: {last_error}"
+            )
             logger.error(error_msg)
             if last_error:
                 logger.warning("Full traceback of last error:")
                 import traceback
+
                 traceback.print_exc()
             raise RuntimeError(error_msg)
         else:
@@ -1272,34 +1221,35 @@ def _load_model(
                 pipe.vae = pipe.vae.to(device=self.device)
 
         # If we get here, the model loaded successfully - break out of retry loop
-        logger.info(f"Model loading succeeded")
+        logger.info("Model loading succeeded")
 
         # Use comprehensive model detection instead of basic detection
         detection_result = detect_model(pipe.unet, pipe)
-        model_type = detection_result['model_type']
-        is_sdxl = detection_result['is_sdxl']
-        is_turbo = detection_result['is_turbo']
-        confidence = detection_result['confidence']
-        
+        model_type = detection_result["model_type"]
+        is_sdxl = detection_result["is_sdxl"]
+        is_turbo = detection_result["is_turbo"]
+        confidence = detection_result["confidence"]
+
         # Store comprehensive model info for later use (after TensorRT conversion)
         self._detected_model_type = model_type
         self._detection_confidence = confidence
         self._is_turbo = is_turbo
         self._is_sdxl = is_sdxl
-        
+
         logger.info(f"_load_model: Detected model type: {model_type} (confidence: {confidence:.2f})")
 
         # Auto-resolve IP-Adapter model/encoder paths for detected architecture.
         # Runs once here so both pre-TRT and post-TRT installation paths see the resolved cfg.
         if use_ipadapter and ipadapter_config:
             from streamdiffusion.modules.ipadapter_module import resolve_ipadapter_paths
+
             _ip_cfgs = ipadapter_config if isinstance(ipadapter_config, list) else [ipadapter_config]
             for _ip_cfg in _ip_cfgs:
                 resolve_ipadapter_paths(_ip_cfg, model_type, is_sdxl)
 
         # DEPRECATED: THIS WILL LOAD LCM_LORA IF USE_LCM_LORA IS TRUE
         # Validate backwards compatibility LCM LoRA selection using proper model detection
-        if hasattr(self, 'use_lcm_lora') and self.use_lcm_lora is not None:
+        if hasattr(self, "use_lcm_lora") and self.use_lcm_lora is not None:
             if self.use_lcm_lora and not self.sd_turbo:
                 if lora_dict is None:
                     lora_dict = {}
@@ -1314,11 +1264,13 @@ def _load_model(
                 else:
                     logger.info(f"LCM LoRA {lcm_lora} already present in lora_dict with scale {lora_dict[lcm_lora]}")
             else:
-                logger.info(f"LCM LoRA will not be loaded because use_lcm_lora is {self.use_lcm_lora} and sd_turbo is {self.sd_turbo}")
+                logger.info(
+                    f"LCM LoRA will not be loaded because use_lcm_lora is {self.use_lcm_lora} and sd_turbo is {self.sd_turbo}"
+                )
 
                 # Remove use_lcm_lora from self
                 self.use_lcm_lora = None
-                logger.info(f"use_lcm_lora has been removed from self")
+                logger.info("use_lcm_lora has been removed from self")
 
         # Get kvo_cache_structure before stream init (needed for TRT export wrapper).
         # Actual cache tensors are created AFTER stream init so we can use
@@ -1326,6 +1278,7 @@ def _load_model(
         # (e.g. TCD sets trt_unet_batch_size = frame_buffer_size, not denoising_steps * frame_buffer_size).
         if use_cached_attn:
             from streamdiffusion.acceleration.tensorrt.models.utils import get_kvo_cache_info
+
             _, kvo_cache_structure, _ = get_kvo_cache_info(pipe.unet, self.height, self.width)
         else:
             kvo_cache_structure = []
@@ -1341,7 +1294,7 @@ def _load_model(
             frame_buffer_size=self.frame_buffer_size,
             use_denoising_batch=self.use_denoising_batch,
             cfg_type=cfg_type,
-            lora_dict=lora_dict, # We pass this to include loras in engine path names
+            lora_dict=lora_dict,  # We pass this to include loras in engine path names
             normalize_prompt_weights=normalize_prompt_weights,
             normalize_seed_weights=normalize_seed_weights,
             scheduler=scheduler,
@@ -1356,26 +1309,28 @@ def _load_model(
         # so this must happen after StreamDiffusion.__init__ to get the correct value.
         if use_cached_attn:
             from streamdiffusion.acceleration.tensorrt.models.utils import create_kvo_cache
-            kvo_cache, _ = create_kvo_cache(pipe.unet,
-                                            batch_size=stream.trt_unet_batch_size,
-                                            cache_maxframes=cache_maxframes,
-                                            height=self.height,
-                                            width=self.width,
-                                            device=self.device,
-                                            dtype=self.dtype)
+
+            kvo_cache, _ = create_kvo_cache(
+                pipe.unet,
+                batch_size=stream.trt_unet_batch_size,
+                cache_maxframes=max_cache_maxframes,  # Allocate at max to avoid runtime resize race
+                height=self.height,
+                width=self.width,
+                device=self.device,
+                dtype=self.dtype,
+            )
             stream.kvo_cache = kvo_cache
 
-        
         # Load and properly merge LoRA weights using the standard diffusers approach
         lora_adapters_to_merge = []
         lora_scales_to_merge = []
-        
+
         # Collect all LoRA adapters and their scales from lora_dict
         if lora_dict is not None:
             for i, (lora_name, lora_scale) in enumerate(lora_dict.items()):
                 adapter_name = f"custom_lora_{i}"
                 logger.info(f"_load_model: Loading LoRA '{lora_name}' with scale {lora_scale}")
-                
+
                 try:
                     # Load LoRA weights with unique adapter name
                     stream.pipe.load_lora_weights(lora_name, adapter_name=adapter_name)
@@ -1386,22 +1341,22 @@ def _load_model(
                     logger.error(f"Failed to load LoRA {lora_name}: {e}")
                     # Continue with other LoRAs even if one fails
                     continue
-        
+
         # Merge all LoRA adapters using the proper diffusers method
         if lora_adapters_to_merge:
             try:
                 for adapter_name, scale in zip(lora_adapters_to_merge, lora_scales_to_merge):
                     logger.info(f"Merging individual LoRA: {adapter_name} with scale {scale}")
                     stream.pipe.fuse_lora(lora_scale=scale, adapter_names=[adapter_name])
-                
+
                 # Clean up after individual merging
                 stream.pipe.unload_lora_weights()
                 logger.info("Successfully merged LoRAs individually")
-                
+
             except Exception as fallback_error:
                 logger.error(f"LoRA merging fallback also failed: {fallback_error}")
                 logger.warning("Continuing without LoRA merging - LoRAs may not be applied correctly")
-                
+
                 # Clean up any partial state
                 try:
                     stream.pipe.unload_lora_weights()
@@ -1425,21 +1380,23 @@ def _load_model(
                 stream.pipe.enable_xformers_memory_efficient_attention()
             if acceleration == "tensorrt":
                 from polygraphy import cuda
+
                 from streamdiffusion.acceleration.tensorrt import TorchVAEEncoder
-                from streamdiffusion.acceleration.tensorrt.runtime_engines.unet_engine import AutoencoderKLEngine, NSFWDetectorEngine
+                from streamdiffusion.acceleration.tensorrt.engine_manager import EngineManager, EngineType
                 from streamdiffusion.acceleration.tensorrt.models.models import (
                     VAE,
+                    NSFWDetector,
                     UNet,
                     VAEEncoder,
-                    NSFWDetector,
                 )
-                from streamdiffusion.acceleration.tensorrt.engine_manager import EngineManager, EngineType
-                # Add ControlNet detection and support
-                from streamdiffusion.model_detection import (
-                    extract_unet_architecture,
-                    validate_architecture
+                from streamdiffusion.acceleration.tensorrt.runtime_engines.unet_engine import (
+                    AutoencoderKLEngine,
+                    NSFWDetectorEngine,
                 )
 
+                # Add ControlNet detection and support
+                from streamdiffusion.model_detection import extract_unet_architecture, validate_architecture
+
                 # Legacy TensorRT implementation (fallback)
                 # Initialize engine manager
                 engine_manager = EngineManager(engine_dir)
@@ -1450,32 +1407,32 @@ def _load_model(
                 unet_arch = {}
                 is_sdxl_model = False
                 load_engine = not compile_engines_only
-                
+
                 # Use the explicit use_ipadapter parameter
                 has_ipadapter = use_ipadapter
-                
+
                 # Determine IP-Adapter presence and token count directly from config (no legacy pipeline)
                 if has_ipadapter and not ipadapter_config:
                     has_ipadapter = False
-                
+
                 try:
                     # Use model detection results already computed during model loading
-                    model_type = getattr(self, '_detected_model_type', 'SD15')
-                    is_sdxl = getattr(self, '_is_sdxl', False)
-                    is_turbo = getattr(self, '_is_turbo', False)
-                    confidence = getattr(self, '_detection_confidence', 0.0)
-                    
+                    model_type = getattr(self, "_detected_model_type", "SD15")
+                    is_sdxl = getattr(self, "_is_sdxl", False)
+                    is_turbo = getattr(self, "_is_turbo", False)
+                    confidence = getattr(self, "_detection_confidence", 0.0)
+
                     if is_sdxl:
                         logger.info(f"Building TensorRT engines for SDXL model: {model_type}")
                         logger.info(f"   Turbo variant: {is_turbo}")
                         logger.info(f"   Detection confidence: {confidence:.2f}")
                     else:
                         logger.info(f"Building TensorRT engines for {model_type}")
-                    
+
                     # Enable IPAdapter TensorRT if configured and available
                     if has_ipadapter:
                         use_ipadapter_trt = True
-                    
+
                     # Only enable ControlNet for legacy TensorRT if ControlNet is actually being used
                     if self.use_controlnet:
                         try:
@@ -1486,7 +1443,7 @@ def _load_model(
                         except Exception as e:
                             logger.warning(f"   ControlNet architecture detection failed: {e}")
                             use_controlnet_trt = False
-                    
+
                     # Set up architecture info for enabled modes
                     if use_controlnet_trt and not use_ipadapter_trt:
                         # ControlNet only: Full architecture needed
@@ -1505,28 +1462,28 @@ def _load_model(
                     else:
                         # Neither enabled: Standard UNet
                         unet_arch = {}
-                        
+
                 except Exception as e:
                     logger.error(f"Advanced model detection failed: {e}")
                     logger.error("   Falling back to basic TensorRT")
-                    
+
                     # Fallback to basic detection
                     try:
                         detection_result = detect_model(stream.unet, None)
-                        model_type = detection_result['model_type']
-                        is_sdxl = detection_result['is_sdxl']
+                        model_type = detection_result["model_type"]
+                        is_sdxl = detection_result["is_sdxl"]
                         if self.use_controlnet:
                             unet_arch = extract_unet_architecture(stream.unet)
                             unet_arch = validate_architecture(unet_arch, model_type)
                             use_controlnet_trt = True
                     except Exception:
                         pass
-                
+
                 if not use_controlnet_trt and not self.use_controlnet:
                     logger.info("ControlNet not enabled, building engines without ControlNet support")
 
                 # Use the engine_dir parameter passed to this function, with fallback to instance variable
-                engine_dir = engine_dir if engine_dir else getattr(self, '_engine_dir', 'engines')
+                engine_dir = engine_dir if engine_dir else getattr(self, "_engine_dir", "engines")
 
                 # Resolve IP-Adapter runtime params from config
                 # Strength is now a runtime input, so we do NOT bake scale into engine identity
@@ -1535,9 +1492,9 @@ def _load_model(
                 if use_ipadapter_trt and has_ipadapter and ipadapter_config:
                     cfg0 = ipadapter_config[0] if isinstance(ipadapter_config, list) else ipadapter_config
                     # scale omitted from engine naming; runtime will pass ipadapter_scale vector
-                    ipadapter_tokens = cfg0.get('num_image_tokens', 4)
+                    ipadapter_tokens = cfg0.get("num_image_tokens", 4)
                     # Determine FaceID type from config for engine naming
-                    is_faceid = (cfg0['type'] == 'faceid')
+                    is_faceid = cfg0["type"] == "faceid"
                 # Generate engine paths using EngineManager
                 unet_path = engine_manager.get_engine_path(
                     EngineType.UNET,
@@ -1553,6 +1510,7 @@ def _load_model(
                     use_cached_attn=use_cached_attn,
                     use_controlnet=use_controlnet_trt,
                     fp8=fp8,
+                    resolution=(self.height, self.width),
                 )
                 vae_encoder_path = engine_manager.get_engine_path(
                     EngineType.VAE_ENCODER,
@@ -1564,7 +1522,8 @@ def _load_model(
                     lora_dict=lora_dict,
                     ipadapter_scale=ipadapter_scale,
                     ipadapter_tokens=ipadapter_tokens,
-                    is_faceid=is_faceid if use_ipadapter_trt else None
+                    is_faceid=is_faceid if use_ipadapter_trt else None,
+                    resolution=(self.height, self.width),
                 )
                 vae_decoder_path = engine_manager.get_engine_path(
                     EngineType.VAE_DECODER,
@@ -1576,7 +1535,8 @@ def _load_model(
                     lora_dict=lora_dict,
                     ipadapter_scale=ipadapter_scale,
                     ipadapter_tokens=ipadapter_tokens,
-                    is_faceid=is_faceid if use_ipadapter_trt else None
+                    is_faceid=is_faceid if use_ipadapter_trt else None,
+                    resolution=(self.height, self.width),
                 )
 
                 # Check if all required engines exist
@@ -1590,14 +1550,16 @@ def _load_model(
 
                 if missing_engines:
                     if build_engines_if_missing:
-                        logger.info(f"Missing TensorRT engines, building them...")
+                        logger.info("Missing TensorRT engines, building them...")
                         for engine in missing_engines:
                             logger.info(f"  - {engine}")
                     else:
-                        error_msg = f"Required TensorRT engines are missing and build_engines_if_missing=False:\n"
+                        error_msg = "Required TensorRT engines are missing and build_engines_if_missing=False:\n"
                         for engine in missing_engines:
                             error_msg += f"  - {engine}\n"
-                        error_msg += f"\nTo build engines, set build_engines_if_missing=True or run the build script manually."
+                        error_msg += (
+                            "\nTo build engines, set build_engines_if_missing=True or run the build script manually."
+                        )
                         raise RuntimeError(error_msg)
 
                 # Determine correct embedding dimension based on model type
@@ -1613,32 +1575,41 @@ def _load_model(
                 # Gather parameters for unified wrapper - validate IPAdapter first for consistent token count
                 control_input_names = None
                 num_tokens = 4  # Default for non-IPAdapter mode
-                
+
                 if use_ipadapter_trt:
                     # Use token count resolved from configuration (default to 4)
                     num_tokens = ipadapter_tokens if isinstance(ipadapter_tokens, int) else 4
 
                 # Compile UNet engine using EngineManager
-                logger.info(f"compile_and_load_engine: Compiling UNet engine for image size: {self.width}x{self.height}")
+                logger.info(
+                    f"compile_and_load_engine: Compiling UNet engine for image size: {self.width}x{self.height}"
+                )
                 try:
-                    logger.debug(f"compile_and_load_engine: use_ipadapter_trt={use_ipadapter_trt}, num_ip_layers={num_ip_layers}, tokens={num_tokens}")
+                    logger.debug(
+                        f"compile_and_load_engine: use_ipadapter_trt={use_ipadapter_trt}, num_ip_layers={num_ip_layers}, tokens={num_tokens}"
+                    )
                 except Exception:
                     pass
-                
+
                 # Note: LoRA weights have already been merged permanently during model loading
-                
+
                 # CRITICAL: Install IPAdapter module BEFORE TensorRT compilation to ensure processors are baked into engines
-                if use_ipadapter and ipadapter_config and not hasattr(stream, '_ipadapter_module'):
+                if use_ipadapter and ipadapter_config and not hasattr(stream, "_ipadapter_module"):
                     # Check if auto-resolution disabled IP-Adapter (e.g. no adapter released for this arch)
                     _cfg_check = ipadapter_config[0] if isinstance(ipadapter_config, list) else ipadapter_config
-                    if _cfg_check.get('enabled', True) is False:
+                    if _cfg_check.get("enabled", True) is False:
                         logger.info(
                             "IP-Adapter disabled by auto-resolution (no compatible adapter for this model). Skipping."
                         )
                         use_ipadapter_trt = False
                     else:
                         try:
-                            from streamdiffusion.modules.ipadapter_module import IPAdapterModule, IPAdapterConfig, IPAdapterType
+                            from streamdiffusion.modules.ipadapter_module import (
+                                IPAdapterConfig,
+                                IPAdapterModule,
+                                IPAdapterType,
+                            )
+
                             logger.info("Installing IPAdapter module before TensorRT compilation...")
 
                             # Snapshot processors before install — IPAdapter.set_ip_adapter() replaces them
@@ -1648,14 +1619,14 @@ def _load_model(
                             # Use first config if list provided
                             cfg = ipadapter_config[0] if isinstance(ipadapter_config, list) else ipadapter_config
                             ip_cfg = IPAdapterConfig(
-                                style_image_key=cfg.get('style_image_key') or 'ipadapter_main',
-                                num_image_tokens=cfg.get('num_image_tokens', 4),
-                                ipadapter_model_path=cfg['ipadapter_model_path'],
-                                image_encoder_path=cfg['image_encoder_path'],
-                                style_image=cfg.get('style_image'),
-                                scale=cfg.get('scale', 1.0),
-                                type=IPAdapterType(cfg.get('type', "regular")),
-                                insightface_model_name=cfg.get('insightface_model_name'),
+                                style_image_key=cfg.get("style_image_key") or "ipadapter_main",
+                                num_image_tokens=cfg.get("num_image_tokens", 4),
+                                ipadapter_model_path=cfg["ipadapter_model_path"],
+                                image_encoder_path=cfg["image_encoder_path"],
+                                style_image=cfg.get("style_image"),
+                                scale=cfg.get("scale", 1.0),
+                                type=IPAdapterType(cfg.get("type", "regular")),
+                                insightface_model_name=cfg.get("insightface_model_name"),
                             )
                             ip_module = IPAdapterModule(ip_cfg)
                             ip_module.install(stream)
@@ -1665,6 +1636,7 @@ def _load_model(
 
                             # Cleanup after IPAdapter installation
                             import gc
+
                             gc.collect()
                             torch.cuda.empty_cache()
                             torch.cuda.synchronize()
@@ -1672,12 +1644,16 @@ def _load_model(
                         except torch.cuda.OutOfMemoryError as oom_error:
                             logger.error(f"CUDA Out of Memory during early IPAdapter installation: {oom_error}")
                             logger.error("Try reducing batch size, using smaller models, or increasing GPU memory")
-                            raise RuntimeError("Insufficient VRAM for IPAdapter installation. Consider using a GPU with more memory or reducing model complexity.")
+                            raise RuntimeError(
+                                "Insufficient VRAM for IPAdapter installation. Consider using a GPU with more memory or reducing model complexity."
+                            )
 
                         except RuntimeError as rt_error:
                             if "size mismatch" in str(rt_error):
-                                unet_dim = getattr(getattr(stream, 'unet', None), 'config', None)
-                                unet_cross_attn = getattr(unet_dim, 'cross_attention_dim', 'unknown') if unet_dim else 'unknown'
+                                unet_dim = getattr(getattr(stream, "unet", None), "config", None)
+                                unet_cross_attn = (
+                                    getattr(unet_dim, "cross_attention_dim", "unknown") if unet_dim else "unknown"
+                                )
                                 logger.warning(
                                     f"IP-Adapter weights are incompatible with this model "
                                     f"(UNet cross_attention_dim={unet_cross_attn}). "
@@ -1690,23 +1666,24 @@ def _load_model(
                                 # them before load_state_dict() failed, leaving the UNet in a corrupted state
                                 try:
                                     stream.unet.set_attn_processor(_saved_unet_processors)
-                                    logger.info("Restored original UNet attention processors after IP-Adapter failure.")
+                                    logger.info(
+                                        "Restored original UNet attention processors after IP-Adapter failure."
+                                    )
                                 except Exception as restore_err:
                                     logger.warning(f"Could not restore UNet processors: {restore_err}")
                                 use_ipadapter_trt = False
                             else:
                                 import traceback
+
                                 traceback.print_exc()
                                 logger.error("Failed to install IPAdapterModule before TensorRT compilation")
                                 raise
 
                         except Exception as e:
                             import traceback
+
                             traceback.print_exc()
-                            logger.warning(
-                                f"Failed to install IPAdapterModule: {e}. "
-                                f"Continuing without IP-Adapter."
-                            )
+                            logger.warning(f"Failed to install IPAdapterModule: {e}. Continuing without IP-Adapter.")
                             try:
                                 stream.unet.set_attn_processor(_saved_unet_processors)
                                 logger.info("Restored original UNet attention processors after IP-Adapter failure.")
@@ -1719,20 +1696,23 @@ def _load_model(
                 # then construct UNet model with that value.
 
                 # Build a temporary unified wrapper to install processors and discover num_ip_layers
-                from streamdiffusion.acceleration.tensorrt.export_wrappers.unet_unified_export import UnifiedExportWrapper
+                from streamdiffusion.acceleration.tensorrt.export_wrappers.unet_unified_export import (
+                    UnifiedExportWrapper,
+                )
+
                 temp_wrapped_unet = UnifiedExportWrapper(
                     stream.unet,
                     use_controlnet=use_controlnet_trt,
                     use_ipadapter=use_ipadapter_trt,
                     control_input_names=None,
-                    num_tokens=num_tokens
+                    num_tokens=num_tokens,
                 )
 
                 num_ip_layers = None
                 if use_ipadapter_trt:
                     # Access underlying IPAdapter wrapper
-                    if hasattr(temp_wrapped_unet, 'ipadapter_wrapper') and temp_wrapped_unet.ipadapter_wrapper:
-                        num_ip_layers = getattr(temp_wrapped_unet.ipadapter_wrapper, 'num_ip_layers', None)
+                    if hasattr(temp_wrapped_unet, "ipadapter_wrapper") and temp_wrapped_unet.ipadapter_wrapper:
+                        num_ip_layers = getattr(temp_wrapped_unet.ipadapter_wrapper, "num_ip_layers", None)
                         if not isinstance(num_ip_layers, int) or num_ip_layers <= 0:
                             raise RuntimeError("Failed to determine num_ip_layers for IP-Adapter")
                         try:
@@ -1765,9 +1745,9 @@ def _load_model(
                 if use_controlnet_trt:
                     # Build control_input_names excluding ipadapter_scale so indices align to 3-base offset
                     all_input_names = unet_model.get_input_names()
-                    control_input_names = [name for name in all_input_names if name != 'ipadapter_scale']
+                    control_input_names = [name for name in all_input_names if name != "ipadapter_scale"]
 
-                # Unified compilation path 
+                # Unified compilation path
                 # Recreate wrapped_unet with control input names if needed (after unet_model is ready)
                 wrapped_unet = UnifiedExportWrapper(
                     stream.unet,
@@ -1780,6 +1760,7 @@ def _load_model(
 
                 if use_cached_attn:
                     from .acceleration.tensorrt.models.attention_processors import CachedSTAttnProcessor2_0
+
                     processors = stream.unet.attn_processors
                     for name, processor in processors.items():
                         # Target self-attention layers (attn1) by name — kvo_cache is only passed
@@ -1788,12 +1769,6 @@ def _load_model(
                         if name.endswith("attn1.processor") and not isinstance(processor, CachedSTAttnProcessor2_0):
                             processors[name] = CachedSTAttnProcessor2_0()
                     stream.unet.set_attn_processor(processors)
-                    # Enable pre-allocated buffers for runtime — ONNX export already completed above,
-                    # so the original clone/contiguous path was used for tracing. From here on, the
-                    # processors run only at Python runtime (non-TRT paths) and buffer reuse is safe.
-                    for proc in stream.unet.attn_processors.values():
-                        if isinstance(proc, CachedSTAttnProcessor2_0):
-                            proc._use_prealloc = True
 
                 # Compile VAE decoder engine using EngineManager
                 vae_decoder_model = VAE(
@@ -1812,13 +1787,10 @@ def _load_model(
                     cuda_stream=None,
                     stream_vae=stream.vae,
                     engine_build_options={
-                        'opt_image_height': self.height,
-                        'opt_image_width': self.width,
-                        'build_dynamic_shape': True,
-                        'min_image_resolution': 384,
-                        'max_image_resolution': 1024,
-                        'build_all_tactics': True,
-                    }
+                        "opt_image_height": self.height,
+                        "opt_image_width": self.width,
+                        "build_dynamic_shape": False,
+                    },
                 )
 
                 # Compile VAE encoder engine using EngineManager
@@ -1838,13 +1810,10 @@ def _load_model(
                     batch_size=self.batch_size if self.mode == "txt2img" else stream.frame_bff_size,
                     cuda_stream=None,
                     engine_build_options={
-                        'opt_image_height': self.height,
-                        'opt_image_width': self.width,
-                        'build_dynamic_shape': True,
-                        'min_image_resolution': 384,
-                        'max_image_resolution': 1024,
-                        'build_all_tactics': True,
-                    }
+                        "opt_image_height": self.height,
+                        "opt_image_width": self.width,
+                        "build_dynamic_shape": False,
+                    },
                 )
 
                 cuda_stream = cuda.Stream()
@@ -1854,6 +1823,25 @@ def _load_model(
 
                 try:
                     logger.info("Loading TensorRT UNet engine...")
+                    # Build engine_build_options, adding FP8 calibration callback when enabled.
+                    _unet_build_opts = {
+                        "opt_image_height": self.height,
+                        "opt_image_width": self.width,
+                        "build_dynamic_shape": False,
+                    }
+                    if fp8:
+                        from streamdiffusion.acceleration.tensorrt.fp8_quantize import (
+                            generate_unet_calibration_data,
+                        )
+                        _captured_model = unet_model
+                        _calib_batch = stream.trt_unet_batch_size
+                        _calib_h, _calib_w = self.height, self.width
+                        _unet_build_opts["fp8"] = True
+                        _unet_build_opts["onnx_opset"] = 19  # modelopt FP8 needs opset ≥19 for fp16 Q/DQ scales
+                        _unet_build_opts["calibration_data_fn"] = lambda: generate_unet_calibration_data(
+                            _captured_model, _calib_batch, _calib_h, _calib_w
+                        )
+
                     # Compile and load UNet engine using EngineManager
                     stream.unet = engine_manager.compile_and_load_engine(
                         EngineType.UNET,
@@ -1867,47 +1855,48 @@ def _load_model(
                         use_ipadapter_trt=use_ipadapter_trt,
                         unet_arch=unet_arch,
                         num_ip_layers=num_ip_layers if use_ipadapter_trt else None,
-                        engine_build_options={
-                            'opt_image_height': self.height,
-                            'opt_image_width': self.width,
-                            'build_all_tactics': True,
-                            'fp8': fp8,
-                        }
+                        engine_build_options=_unet_build_opts,
                     )
                     if load_engine:
                         logger.info("TensorRT UNet engine loaded successfully")
-                    
+
                 except Exception as e:
                     error_msg = str(e).lower()
-                    is_oom_error = ('out of memory' in error_msg or 'outofmemory' in error_msg or 
-                                   'oom' in error_msg or 'cuda error' in error_msg)
-                    
+                    is_oom_error = (
+                        "out of memory" in error_msg
+                        or "outofmemory" in error_msg
+                        or "oom" in error_msg
+                        or "cuda error" in error_msg
+                    )
+
                     if is_oom_error:
                         logger.error(f"TensorRT UNet engine OOM: {e}")
                         logger.info("Falling back to PyTorch UNet (no TensorRT acceleration)")
                         logger.info("This will be slower but should work with less memory")
-                        
+
                         # Clean up any partial TensorRT state
-                        if hasattr(stream, 'unet'):
+                        if hasattr(stream, "unet"):
                             try:
                                 del stream.unet
                             except:
                                 pass
-                        
+
                         self.cleanup_gpu_memory()
-                        
+
                         # Fall back to original PyTorch UNet
                         try:
                             logger.info("Loading PyTorch UNet as fallback...")
                             # Keep the original UNet from the pipe
-                            if hasattr(stream, 'pipe') and hasattr(stream.pipe, 'unet'):
+                            if hasattr(stream, "pipe") and hasattr(stream.pipe, "unet"):
                                 stream.unet = stream.pipe.unet
                                 logger.info("PyTorch UNet fallback successful")
                             else:
                                 raise RuntimeError("No PyTorch UNet available for fallback")
                         except Exception as fallback_error:
                             logger.error(f"PyTorch UNet fallback also failed: {fallback_error}")
-                            raise RuntimeError(f"Both TensorRT and PyTorch UNet loading failed. TensorRT error: {e}, Fallback error: {fallback_error}")
+                            raise RuntimeError(
+                                f"Both TensorRT and PyTorch UNet loading failed. TensorRT error: {e}, Fallback error: {fallback_error}"
+                            )
                     else:
                         # Non-OOM error, re-raise
                         logger.error(f"TensorRT UNet engine loading failed (non-OOM): {e}")
@@ -1915,7 +1904,9 @@ def _load_model(
 
                 if load_engine:
                     try:
-                        logger.info(f"Loading TensorRT VAE engines vae_encoder_path: {vae_encoder_path}, vae_decoder_path: {vae_decoder_path}")
+                        logger.info(
+                            f"Loading TensorRT VAE engines vae_encoder_path: {vae_encoder_path}, vae_decoder_path: {vae_decoder_path}"
+                        )
                         stream.vae = AutoencoderKLEngine(
                             str(vae_encoder_path),
                             str(vae_decoder_path),
@@ -1926,38 +1917,44 @@ def _load_model(
                         stream.vae.config = vae_config
                         stream.vae.dtype = vae_dtype
                         logger.info("TensorRT VAE engines loaded successfully")
-                        
+
                     except Exception as e:
                         error_msg = str(e).lower()
-                        is_oom_error = ('out of memory' in error_msg or 'outofmemory' in error_msg or 
-                                    'oom' in error_msg or 'cuda error' in error_msg)
-                        
+                        is_oom_error = (
+                            "out of memory" in error_msg
+                            or "outofmemory" in error_msg
+                            or "oom" in error_msg
+                            or "cuda error" in error_msg
+                        )
+
                         if is_oom_error:
                             logger.error(f"TensorRT VAE engine OOM: {e}")
                             logger.info("Falling back to PyTorch VAE (no TensorRT acceleration)")
                             logger.info("This will be slower but should work with less memory")
-                            
+
                             # Clean up any partial TensorRT state
-                            if hasattr(stream, 'vae'):
+                            if hasattr(stream, "vae"):
                                 try:
                                     del stream.vae
                                 except:
                                     pass
-                            
+
                             self.cleanup_gpu_memory()
-                            
+
                             # Fall back to original PyTorch VAE
                             try:
                                 logger.info("Loading PyTorch VAE as fallback...")
                                 # Keep the original VAE from the pipe
-                                if hasattr(stream, 'pipe') and hasattr(stream.pipe, 'vae'):
+                                if hasattr(stream, "pipe") and hasattr(stream.pipe, "vae"):
                                     stream.vae = stream.pipe.vae
                                     logger.info("PyTorch VAE fallback successful")
                                 else:
                                     raise RuntimeError("No PyTorch VAE available for fallback")
                             except Exception as fallback_error:
                                 logger.error(f"PyTorch VAE fallback also failed: {fallback_error}")
-                                raise RuntimeError(f"Both TensorRT and PyTorch VAE loading failed. TensorRT error: {e}, Fallback error: {fallback_error}")
+                                raise RuntimeError(
+                                    f"Both TensorRT and PyTorch VAE loading failed. TensorRT error: {e}, Fallback error: {fallback_error}"
+                                )
                         else:
                             # Non-OOM error, re-raise
                             logger.error(f"TensorRT VAE engine loading failed (non-OOM): {e}")
@@ -1978,6 +1975,7 @@ def _load_model(
                 if self.use_safety_checker or safety_checker_engine_exists:
                     if not safety_checker_engine_exists:
                         from transformers import AutoModelForImageClassification
+
                         self.safety_checker = AutoModelForImageClassification.from_pretrained(safety_checker_model_id)
 
                         safety_checker_model = NSFWDetector(
@@ -1995,7 +1993,7 @@ def _load_model(
                             cuda_stream=None,
                             load_engine=False,
                         )
-                    
+
                     if load_engine:
                         self.safety_checker = NSFWDetectorEngine(
                             safety_checker_path,
@@ -2003,7 +2001,7 @@ def _load_model(
                             use_cuda_graph=True,
                         )
                         logger.info("Safety Checker engine loaded successfully")
-                        
+
             if acceleration == "sfast":
                 from streamdiffusion.acceleration.sfast import (
                     accelerate_with_stable_fast,
@@ -2012,13 +2010,15 @@ def _load_model(
                 stream = accelerate_with_stable_fast(stream)
         except Exception:
             import traceback
+
             traceback.print_exc()
             raise Exception("Acceleration has failed.")
 
         # Install modules via hooks instead of patching (wrapper keeps forwarding updates only)
         if use_controlnet:
             try:
-                from streamdiffusion.modules.controlnet_module import ControlNetModule, ControlNetConfig
+                from streamdiffusion.modules.controlnet_module import ControlNetConfig, ControlNetModule
+
                 cn_module = ControlNetModule(device=self.device, dtype=self.dtype)
                 cn_module.install(stream)
                 # Normalize to list of configs
@@ -2030,28 +2030,28 @@ def _load_model(
                     else []
                 )
                 for cfg in configs:
-                    if not cfg.get('model_id'):
+                    if not cfg.get("model_id"):
                         continue
                     cn_cfg = ControlNetConfig(
-                        model_id=cfg['model_id'],
-                        preprocessor=cfg.get('preprocessor'),
-                        conditioning_scale=cfg.get('conditioning_scale', 1.0),
-                        enabled=cfg.get('enabled', True),
-                        conditioning_channels=cfg.get('conditioning_channels'),
-                        preprocessor_params=cfg.get('preprocessor_params'),
+                        model_id=cfg["model_id"],
+                        preprocessor=cfg.get("preprocessor"),
+                        conditioning_scale=cfg.get("conditioning_scale", 1.0),
+                        enabled=cfg.get("enabled", True),
+                        conditioning_channels=cfg.get("conditioning_channels"),
+                        preprocessor_params=cfg.get("preprocessor_params"),
                     )
-                    cn_module.add_controlnet(cn_cfg, control_image=cfg.get('control_image'))
+                    cn_module.add_controlnet(cn_cfg, control_image=cfg.get("control_image"))
                 # Expose for later updates if needed by caller code
                 stream._controlnet_module = cn_module
 
                 try:
                     compiled_cn_engines = []
                     for cfg, cn_model in zip(configs, cn_module.controlnets):
-                        if not cfg or not cfg.get('model_id') or cn_model is None:
+                        if not cfg or not cfg.get("model_id") or cn_model is None:
                             continue
                         try:
                             engine = engine_manager.get_or_load_controlnet_engine(
-                                model_id=cfg['model_id'],
+                                model_id=cfg["model_id"],
                                 pytorch_model=cn_model,
                                 model_type=model_type,
                                 batch_size=stream.trt_unet_batch_size,
@@ -2060,29 +2060,33 @@ def _load_model(
                                 cuda_stream=cuda_stream,
                                 use_cuda_graph=False,
                                 unet=None,
-                                model_path=cfg['model_id'],
+                                model_path=cfg["model_id"],
                                 load_engine=load_engine,
-                                conditioning_channels=cfg.get('conditioning_channels', 3)
+                                conditioning_channels=cfg.get("conditioning_channels", 3),
                             )
                             try:
-                                setattr(engine, 'model_id', cfg['model_id'])
+                                setattr(engine, "model_id", cfg["model_id"])
                             except Exception:
                                 pass
                             compiled_cn_engines.append(engine)
                         except Exception as e:
                             logger.warning(f"Failed to compile/load ControlNet engine for {cfg.get('model_id')}: {e}")
                     if compiled_cn_engines:
-                        setattr(stream, 'controlnet_engines', compiled_cn_engines)
+                        setattr(stream, "controlnet_engines", compiled_cn_engines)
                         try:
                             logger.info(f"Compiled/loaded {len(compiled_cn_engines)} ControlNet TensorRT engine(s)")
                         except Exception:
                             pass
                 except Exception:
                     import traceback
+
                     traceback.print_exc()
-                    logger.warning("ControlNet TensorRT engine build step encountered an issue; continuing with PyTorch ControlNet")
+                    logger.warning(
+                        "ControlNet TensorRT engine build step encountered an issue; continuing with PyTorch ControlNet"
+                    )
             except Exception:
                 import traceback
+
                 traceback.print_exc()
                 logger.error("Failed to install ControlNetModule")
                 raise
@@ -2091,24 +2095,30 @@ def _load_model(
         # This ensures processors are properly baked into the TensorRT engines
         # After TRT compilation, stream.unet is a UNet2DConditionModelEngine with no attn_processors —
         # skip IP-Adapter install entirely in that case.
-        if use_ipadapter and ipadapter_config and not hasattr(stream, '_ipadapter_module') and hasattr(stream.unet, 'attn_processors'):
+        if (
+            use_ipadapter
+            and ipadapter_config
+            and not hasattr(stream, "_ipadapter_module")
+            and hasattr(stream.unet, "attn_processors")
+        ):
             try:
-                from streamdiffusion.modules.ipadapter_module import IPAdapterModule, IPAdapterConfig, IPAdapterType
+                from streamdiffusion.modules.ipadapter_module import IPAdapterConfig, IPAdapterModule, IPAdapterType
+
                 # Use first config if list provided
                 cfg = ipadapter_config[0] if isinstance(ipadapter_config, list) else ipadapter_config
 
                 # Get adapter type from config
-                ipadapter_type = IPAdapterType(cfg['type'])
+                ipadapter_type = IPAdapterType(cfg["type"])
 
                 ip_cfg = IPAdapterConfig(
-                    style_image_key=cfg.get('style_image_key') or 'ipadapter_main',
-                    num_image_tokens=cfg.get('num_image_tokens', 4),
-                    ipadapter_model_path=cfg['ipadapter_model_path'],
-                    image_encoder_path=cfg['image_encoder_path'],
-                    style_image=cfg.get('style_image'),
-                    scale=cfg.get('scale', 1.0),
+                    style_image_key=cfg.get("style_image_key") or "ipadapter_main",
+                    num_image_tokens=cfg.get("num_image_tokens", 4),
+                    ipadapter_model_path=cfg["ipadapter_model_path"],
+                    image_encoder_path=cfg["image_encoder_path"],
+                    style_image=cfg.get("style_image"),
+                    scale=cfg.get("scale", 1.0),
                     type=ipadapter_type,
-                    insightface_model_name=cfg.get('insightface_model_name'),
+                    insightface_model_name=cfg.get("insightface_model_name"),
                 )
                 ip_module = IPAdapterModule(ip_cfg)
                 _saved_unet_processors_post = {name: proc for name, proc in stream.unet.attn_processors.items()}
@@ -2118,8 +2128,8 @@ def _load_model(
 
             except RuntimeError as rt_error:
                 if "size mismatch" in str(rt_error):
-                    unet_dim = getattr(getattr(stream, 'unet', None), 'config', None)
-                    unet_cross_attn = getattr(unet_dim, 'cross_attention_dim', 'unknown') if unet_dim else 'unknown'
+                    unet_dim = getattr(getattr(stream, "unet", None), "config", None)
+                    unet_cross_attn = getattr(unet_dim, "cross_attention_dim", "unknown") if unet_dim else "unknown"
                     logger.warning(
                         f"IP-Adapter weights are incompatible with this model "
                         f"(UNet cross_attention_dim={unet_cross_attn}). "
@@ -2131,12 +2141,14 @@ def _load_model(
                         logger.warning(f"Could not restore UNet processors: {restore_err}")
                 else:
                     import traceback
+
                     traceback.print_exc()
                     logger.error("Failed to install IPAdapterModule")
                     raise
 
             except Exception:
                 import traceback
+
                 traceback.print_exc()
                 logger.error("Failed to install IPAdapterModule")
                 raise
@@ -2144,45 +2156,49 @@ def _load_model(
         # Note: LoRA weights have already been merged permanently during model loading
 
         # Install pipeline hook modules (Phase 4: Configuration Integration)
-        if image_preprocessing_config and image_preprocessing_config.get('enabled', True):
+        if image_preprocessing_config and image_preprocessing_config.get("enabled", True):
             try:
                 from streamdiffusion.modules.image_processing_module import ImagePreprocessingModule
+
                 img_pre_module = ImagePreprocessingModule()
                 img_pre_module.install(stream)
-                for proc_config in image_preprocessing_config.get('processors', []):
+                for proc_config in image_preprocessing_config.get("processors", []):
                     img_pre_module.add_processor(proc_config)
                 stream._image_preprocessing_module = img_pre_module
             except Exception as e:
                 logger.error(f"Failed to install ImagePreprocessingModule: {e}")
-        
-        if image_postprocessing_config and image_postprocessing_config.get('enabled', True):
+
+        if image_postprocessing_config and image_postprocessing_config.get("enabled", True):
             try:
                 from streamdiffusion.modules.image_processing_module import ImagePostprocessingModule
+
                 img_post_module = ImagePostprocessingModule()
                 img_post_module.install(stream)
-                for proc_config in image_postprocessing_config.get('processors', []):
+                for proc_config in image_postprocessing_config.get("processors", []):
                     img_post_module.add_processor(proc_config)
                 stream._image_postprocessing_module = img_post_module
             except Exception as e:
                 logger.error(f"Failed to install ImagePostprocessingModule: {e}")
-        
-        if latent_preprocessing_config and latent_preprocessing_config.get('enabled', True):
+
+        if latent_preprocessing_config and latent_preprocessing_config.get("enabled", True):
             try:
                 from streamdiffusion.modules.latent_processing_module import LatentPreprocessingModule
+
                 latent_pre_module = LatentPreprocessingModule()
                 latent_pre_module.install(stream)
-                for proc_config in latent_preprocessing_config.get('processors', []):
+                for proc_config in latent_preprocessing_config.get("processors", []):
                     latent_pre_module.add_processor(proc_config)
                 stream._latent_preprocessing_module = latent_pre_module
             except Exception as e:
                 logger.error(f"Failed to install LatentPreprocessingModule: {e}")
-        
-        if latent_postprocessing_config and latent_postprocessing_config.get('enabled', True):
+
+        if latent_postprocessing_config and latent_postprocessing_config.get("enabled", True):
             try:
                 from streamdiffusion.modules.latent_processing_module import LatentPostprocessingModule
+
                 latent_post_module = LatentPostprocessingModule()
                 latent_post_module.install(stream)
-                for proc_config in latent_postprocessing_config.get('processors', []):
+                for proc_config in latent_postprocessing_config.get("processors", []):
                     latent_post_module.add_processor(proc_config)
                 stream._latent_postprocessing_module = latent_post_module
             except Exception as e:
@@ -2193,6 +2209,7 @@ def _load_model(
         # Requires Ampere+ (compute 8.0+). Expected gain: 2-5% end-to-end on SD1.5/SD-Turbo.
         try:
             from streamdiffusion.tools.cuda_l2_cache import setup_l2_persistence
+
             setup_l2_persistence(stream.unet)
         except Exception as e:
             logger.debug(f"L2 cache persistence setup skipped: {e}")
@@ -2202,39 +2219,45 @@ def _load_model(
     def get_last_processed_image(self, index: int) -> Optional[Image.Image]:
         """Forward get_last_processed_image call to the underlying ControlNet pipeline"""
         if not self.use_controlnet:
-            raise RuntimeError("get_last_processed_image: ControlNet support not enabled. Set use_controlnet=True in constructor.")
+            raise RuntimeError(
+                "get_last_processed_image: ControlNet support not enabled. Set use_controlnet=True in constructor."
+            )
 
         return self.stream.get_last_processed_image(index)
-        
+
     def cleanup_controlnets(self) -> None:
         """Cleanup ControlNet resources including background threads and VRAM"""
         if not self.use_controlnet:
             return
-            
-        if hasattr(self, 'stream') and self.stream and hasattr(self.stream, 'cleanup'):
+
+        if hasattr(self, "stream") and self.stream and hasattr(self.stream, "cleanup"):
             self.stream.cleanup_controlnets()
 
     def update_control_image(self, index: int, image: Union[str, Image.Image, torch.Tensor]) -> None:
         """Update control image for specific ControlNet index"""
         if not self.use_controlnet:
-            raise RuntimeError("update_control_image: ControlNet support not enabled. Set use_controlnet=True in constructor.")
+            raise RuntimeError(
+                "update_control_image: ControlNet support not enabled. Set use_controlnet=True in constructor."
+            )
         if not self.skip_diffusion:
             self.stream._controlnet_module.update_control_image_efficient(image, index=index)
         else:
             logger.debug("update_control_image: Skipping ControlNet update in skip diffusion mode")
 
-    def update_style_image(self, image: Union[str, Image.Image, torch.Tensor], is_stream: bool = False, style_key = "ipadapter_main") -> None:
+    def update_style_image(
+        self, image: Union[str, Image.Image, torch.Tensor], is_stream: bool = False, style_key="ipadapter_main"
+    ) -> None:
         """Update IPAdapter style image"""
         if not self.use_ipadapter:
-            raise RuntimeError("update_style_image: IPAdapter support not enabled. Set use_ipadapter=True in constructor.")
-        
+            raise RuntimeError(
+                "update_style_image: IPAdapter support not enabled. Set use_ipadapter=True in constructor."
+            )
+
         if not self.skip_diffusion:
             self.stream._param_updater.update_style_image(style_key, image, is_stream=is_stream)
         else:
             logger.debug("update_style_image: Skipping IPAdapter update in skip diffusion mode")
-        
-        
-        
+
     def clear_caches(self) -> None:
         """Clear all cached prompt embeddings and seed noise tensors."""
         self.stream._param_updater.clear_caches()
@@ -2261,52 +2284,58 @@ def get_stream_state(self, include_caches: bool = False) -> Dict[str, Any]:
         normalize_seed_weights = updater.get_normalize_seed_weights()
 
         # Core runtime params
-        guidance_scale = getattr(stream, 'guidance_scale', None)
-        delta = getattr(stream, 'delta', None)
-        t_index_list = list(getattr(stream, 't_list', []))
-        current_seed = getattr(stream, 'current_seed', None)
+        guidance_scale = getattr(stream, "guidance_scale", None)
+        delta = getattr(stream, "delta", None)
+        t_index_list = list(getattr(stream, "t_list", []))
+        current_seed = getattr(stream, "current_seed", None)
         num_inference_steps = None
         try:
-            if hasattr(stream, 'timesteps') and stream.timesteps is not None:
+            if hasattr(stream, "timesteps") and stream.timesteps is not None:
                 num_inference_steps = int(len(stream.timesteps))
         except Exception:
             pass
 
         # Resolution and model/pipeline info
         state: Dict[str, Any] = {
-            'width': getattr(stream, 'width', None),
-            'height': getattr(stream, 'height', None),
-            'latent_width': getattr(stream, 'latent_width', None),
-            'latent_height': getattr(stream, 'latent_height', None),
-            'device': getattr(stream, 'device', None).type if hasattr(getattr(stream, 'device', None), 'type') else getattr(stream, 'device', None),
-            'dtype': str(getattr(stream, 'dtype', None)),
-            'model_type': getattr(stream, 'model_type', None),
-            'is_sdxl': getattr(stream, 'is_sdxl', None),
-            'is_turbo': getattr(stream, 'is_turbo', None),
-            'cfg_type': getattr(stream, 'cfg_type', None),
-            'use_denoising_batch': getattr(stream, 'use_denoising_batch', None),
-            'batch_size': getattr(stream, 'batch_size', None),
-            'min_batch_size': getattr(stream, 'min_batch_size', None),
-            'max_batch_size': getattr(stream, 'max_batch_size', None),
+            "width": getattr(stream, "width", None),
+            "height": getattr(stream, "height", None),
+            "latent_width": getattr(stream, "latent_width", None),
+            "latent_height": getattr(stream, "latent_height", None),
+            "device": getattr(stream, "device", None).type
+            if hasattr(getattr(stream, "device", None), "type")
+            else getattr(stream, "device", None),
+            "dtype": str(getattr(stream, "dtype", None)),
+            "model_type": getattr(stream, "model_type", None),
+            "is_sdxl": getattr(stream, "is_sdxl", None),
+            "is_turbo": getattr(stream, "is_turbo", None),
+            "cfg_type": getattr(stream, "cfg_type", None),
+            "use_denoising_batch": getattr(stream, "use_denoising_batch", None),
+            "batch_size": getattr(stream, "batch_size", None),
+            "min_batch_size": getattr(stream, "min_batch_size", None),
+            "max_batch_size": getattr(stream, "max_batch_size", None),
         }
 
         # Blending state
-        state.update({
-            'prompt_list': prompts,
-            'seed_list': seeds,
-            'normalize_prompt_weights': normalize_prompt_weights,
-            'normalize_seed_weights': normalize_seed_weights,
-            'negative_prompt': getattr(updater, '_current_negative_prompt', ""),
-        })
+        state.update(
+            {
+                "prompt_list": prompts,
+                "seed_list": seeds,
+                "normalize_prompt_weights": normalize_prompt_weights,
+                "normalize_seed_weights": normalize_seed_weights,
+                "negative_prompt": getattr(updater, "_current_negative_prompt", ""),
+            }
+        )
 
         # Core runtime knobs
-        state.update({
-            'guidance_scale': guidance_scale,
-            'delta': delta,
-            't_index_list': t_index_list,
-            'current_seed': current_seed,
-            'num_inference_steps': num_inference_steps,
-        })
+        state.update(
+            {
+                "guidance_scale": guidance_scale,
+                "delta": delta,
+                "t_index_list": t_index_list,
+                "current_seed": current_seed,
+                "num_inference_steps": num_inference_steps,
+            }
+        )
 
         # Module configs (ControlNet, IP-Adapter)
         try:
@@ -2319,97 +2348,100 @@ def get_stream_state(self, include_caches: bool = False) -> Dict[str, Any]:
             ipadapter_config = None
         # Hook configs
         try:
-            image_preprocessing_config = updater._get_current_hook_config('image_preprocessing')
+            image_preprocessing_config = updater._get_current_hook_config("image_preprocessing")
         except Exception:
             image_preprocessing_config = []
         try:
-            image_postprocessing_config = updater._get_current_hook_config('image_postprocessing')
+            image_postprocessing_config = updater._get_current_hook_config("image_postprocessing")
         except Exception:
             image_postprocessing_config = []
         try:
-            latent_preprocessing_config = updater._get_current_hook_config('latent_preprocessing')
+            latent_preprocessing_config = updater._get_current_hook_config("latent_preprocessing")
         except Exception:
             latent_preprocessing_config = []
         try:
-            latent_postprocessing_config = updater._get_current_hook_config('latent_postprocessing')
+            latent_postprocessing_config = updater._get_current_hook_config("latent_postprocessing")
         except Exception:
             latent_postprocessing_config = []
-            
-        state.update({
-            'controlnet_config': controlnet_config,
-            'ipadapter_config': ipadapter_config,
-            'image_preprocessing_config': image_preprocessing_config,
-            'image_postprocessing_config': image_postprocessing_config,
-            'latent_preprocessing_config': latent_preprocessing_config,
-            'latent_postprocessing_config': latent_postprocessing_config,
-        })
+
+        state.update(
+            {
+                "controlnet_config": controlnet_config,
+                "ipadapter_config": ipadapter_config,
+                "image_preprocessing_config": image_preprocessing_config,
+                "image_postprocessing_config": image_postprocessing_config,
+                "latent_preprocessing_config": latent_preprocessing_config,
+                "latent_postprocessing_config": latent_postprocessing_config,
+            }
+        )
 
         # Optional caches
         if include_caches:
             try:
-                state['caches'] = updater.get_cache_info()
+                state["caches"] = updater.get_cache_info()
             except Exception:
-                state['caches'] = None
+                state["caches"] = None
 
         return state
-    
+
     def cleanup_gpu_memory(self) -> None:
         """Comprehensive GPU memory cleanup for model switching."""
         import gc
+
         import torch
-        
+
         logger.info("Cleaning up GPU memory...")
-        
+
         # Clear prompt caches
-        if hasattr(self, 'stream') and self.stream:
+        if hasattr(self, "stream") and self.stream:
             try:
                 self.stream._param_updater.clear_caches()
                 logger.info("   Cleared prompt caches")
             except:
                 pass
-        
+
         # Enhanced TensorRT engine cleanup
-        if hasattr(self, 'stream') and self.stream:
+        if hasattr(self, "stream") and self.stream:
             try:
                 # Cleanup UNet TensorRT engine
-                if hasattr(self.stream, 'unet'):
+                if hasattr(self.stream, "unet"):
                     unet_engine = self.stream.unet
                     logger.info("   Cleaning up TensorRT UNet engine...")
-                    
+
                     # Check if it's a TensorRT engine and cleanup properly
-                    if hasattr(unet_engine, 'engine') and hasattr(unet_engine.engine, '__del__'):
+                    if hasattr(unet_engine, "engine") and hasattr(unet_engine.engine, "__del__"):
                         try:
                             # Call the engine's destructor explicitly
                             unet_engine.engine.__del__()
                         except:
                             pass
-                    
+
                     # Clear all engine-related attributes
-                    if hasattr(unet_engine, 'context'):
+                    if hasattr(unet_engine, "context"):
                         try:
                             del unet_engine.context
                         except:
                             pass
-                    if hasattr(unet_engine, 'engine'):
+                    if hasattr(unet_engine, "engine"):
                         try:
                             del unet_engine.engine.engine  # TensorRT runtime engine
                             del unet_engine.engine
                         except:
                             pass
-                    
+
                     del self.stream.unet
                     logger.info("   UNet engine cleanup completed")
-                    
+
                 # Cleanup VAE TensorRT engines
-                if hasattr(self.stream, 'vae'):
+                if hasattr(self.stream, "vae"):
                     vae_engine = self.stream.vae
                     logger.info("   Cleaning up TensorRT VAE engines...")
-                    
+
                     # VAE has encoder and decoder engines
-                    for engine_name in ['vae_encoder', 'vae_decoder']:
+                    for engine_name in ["vae_encoder", "vae_decoder"]:
                         if hasattr(vae_engine, engine_name):
                             engine = getattr(vae_engine, engine_name)
-                            if hasattr(engine, 'engine') and hasattr(engine.engine, '__del__'):
+                            if hasattr(engine, "engine") and hasattr(engine.engine, "__del__"):
                                 try:
                                     engine.engine.__del__()
                                 except:
@@ -2418,12 +2450,12 @@ def cleanup_gpu_memory(self) -> None:
                                 delattr(vae_engine, engine_name)
                             except:
                                 pass
-                    
+
                     del self.stream.vae
                     logger.info("   VAE engines cleanup completed")
-                
+
                 # Cleanup ControlNet engine pool if it exists
-                if hasattr(self.stream, 'controlnet_engine_pool'):
+                if hasattr(self.stream, "controlnet_engine_pool"):
                     logger.info("   Cleaning up ControlNet engine pool...")
                     try:
                         self.stream.controlnet_engine_pool.cleanup()
@@ -2431,76 +2463,78 @@ def cleanup_gpu_memory(self) -> None:
                         logger.info("   ControlNet engine pool cleanup completed")
                     except:
                         pass
-                    
+
             except Exception as e:
                 logger.error(f"   TensorRT cleanup warning: {e}")
-        
+
         # Clear the entire stream object to free all models
-        if hasattr(self, 'stream'):
+        if hasattr(self, "stream"):
             try:
                 del self.stream
                 logger.info("   Cleared stream object")
             except:
                 pass
             self.stream = None
-        
+
         # Force multiple garbage collection cycles for thorough cleanup
         for i in range(3):
             gc.collect()
-        
+
         # Clear CUDA cache and cleanup IPC handles
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
-        
+
         # Force additional memory cleanup
         torch.cuda.ipc_collect()
-        
+
         # Get memory info
         allocated = torch.cuda.memory_allocated() / (1024**3)  # GB
-        cached = torch.cuda.memory_reserved() / (1024**3)     # GB
+        cached = torch.cuda.memory_reserved() / (1024**3)  # GB
         logger.info(f"   GPU Memory after cleanup: {allocated:.2f}GB allocated, {cached:.2f}GB cached")
-        
+
         logger.info("   Enhanced GPU memory cleanup complete")
 
     def check_gpu_memory_for_engine(self, engine_size_gb: float) -> bool:
         """
         Check if there's enough GPU memory to load a TensorRT engine.
-        
+
         Args:
             engine_size_gb: Expected engine size in GB
-            
+
         Returns:
             True if enough memory is available, False otherwise
         """
         if not torch.cuda.is_available():
             return True  # Assume OK if CUDA not available
-        
+
         try:
             # Get current memory status
             allocated = torch.cuda.memory_allocated() / (1024**3)
             cached = torch.cuda.memory_reserved() / (1024**3)
-            
+
             # Get total GPU memory
             total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
             free_memory = total_memory - allocated
-            
+
             # Add 20% overhead for safety
             required_memory = engine_size_gb * 1.2
-            
-            logger.info(f"GPU Memory Check:")
+
+            logger.info("GPU Memory Check:")
             logger.info(f"   Total: {total_memory:.2f}GB")
-            logger.info(f"   Allocated: {allocated:.2f}GB") 
+            logger.info(f"   Allocated: {allocated:.2f}GB")
             logger.info(f"   Cached: {cached:.2f}GB")
             logger.info(f"   Free: {free_memory:.2f}GB")
             logger.info(f"   Required: {required_memory:.2f}GB (engine: {engine_size_gb:.2f}GB + 20% overhead)")
-            
+
             if free_memory >= required_memory:
-                logger.info(f"   Sufficient memory available")
+                logger.info("   Sufficient memory available")
                 return True
             else:
-                logger.error(f"   Insufficient memory! Need {required_memory:.2f}GB but only {free_memory:.2f}GB available")
+                logger.error(
+                    f"   Insufficient memory! Need {required_memory:.2f}GB but only {free_memory:.2f}GB available"
+                )
                 return False
-                
+
         except Exception as e:
             logger.error(f"   Memory check failed: {e}")
             return True  # Assume OK if check fails
@@ -2508,22 +2542,22 @@ def check_gpu_memory_for_engine(self, engine_size_gb: float) -> bool:
     def cleanup_engines_and_rebuild(self, reduce_batch_size: bool = True, reduce_resolution: bool = False) -> None:
         """
         Clean up TensorRT engines and rebuild with smaller settings to fix OOM issues.
-        
+
         Parameters:
         -----------
         reduce_batch_size : bool
             If True, reduce batch size to 1
-        reduce_resolution : bool  
+        reduce_resolution : bool
             If True, reduce resolution by half
         """
-        import shutil
         import os
-        
+        import shutil
+
         logger.info("Cleaning up engines and rebuilding with smaller settings...")
-        
+
         # Clean up GPU memory first
         self.cleanup_gpu_memory()
-        
+
         # Remove engines directory
         engines_dir = "engines"
         if os.path.exists(engines_dir):
@@ -2532,22 +2566,22 @@ def cleanup_engines_and_rebuild(self, reduce_batch_size: bool = True, reduce_res
                 logger.info(f"   Removed engines directory: {engines_dir}")
             except Exception as e:
                 logger.error(f"   Failed to remove engines: {e}")
-        
+
         # Reduce settings
         if reduce_batch_size:
-            if hasattr(self, 'batch_size') and self.batch_size > 1:
+            if hasattr(self, "batch_size") and self.batch_size > 1:
                 old_batch = self.batch_size
                 self.batch_size = 1
                 logger.info(f"   Reduced batch size: {old_batch} -> {self.batch_size}")
-            
+
             # Also reduce frame buffer size if needed
-            if hasattr(self, 'frame_buffer_size') and self.frame_buffer_size > 1:
+            if hasattr(self, "frame_buffer_size") and self.frame_buffer_size > 1:
                 old_buffer = self.frame_buffer_size
-                self.frame_buffer_size = 1  
+                self.frame_buffer_size = 1
                 logger.info(f"   Reduced frame buffer size: {old_buffer} -> {self.frame_buffer_size}")
-        
+
         if reduce_resolution:
-            if hasattr(self, 'width') and hasattr(self, 'height'):
+            if hasattr(self, "width") and hasattr(self, "height"):
                 old_width, old_height = self.width, self.height
                 self.width = max(512, self.width // 2)
                 self.height = max(512, self.height // 2)
@@ -2555,5 +2589,5 @@ def cleanup_engines_and_rebuild(self, reduce_batch_size: bool = True, reduce_res
                 self.width = (self.width // 64) * 64
                 self.height = (self.height // 64) * 64
                 logger.info(f"   Reduced resolution: {old_width}x{old_height} -> {self.width}x{self.height}")
-        
+
         logger.info("   Next model load will rebuild engines with these smaller settings")

From 6a7315f318db3200d0b93bf4e7a3e754e5755fc8 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 22:22:57 -0400
Subject: [PATCH 04/10] perf(trt): fully static batch profiles to unlock l2tc
 on UNet
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Even with static spatial shapes (512x512), TRT's l2tc (L2 tiling cache
optimization) was still disabled for UNet because the batch dimension
remained dynamic (min=1, max=4). TRT checks that ALL dimensions are
concrete before enabling l2tc — a single symbolic dimension disables it
for the entire graph.

Fix: set build_static_batch=True for all three engine types (UNet, VAE
decoder, VAE encoder) and ControlNet. Since t_index_list is fixed and
cfg_type='self' (never 'full') is always used, the UNet batch is always
exactly len(t_index_list)=2 — never changes at runtime.

Also fix get_minmax_dims() static_batch path: was setting
min_batch = max(1, batch_size-1) which still created a range (1-2).
Now sets min_batch = max_batch = batch_size for a true single-point
profile that TRT treats as fully concrete.

With all dimensions concrete (batch + spatial), the next UNet build
should show tiling_optimization_level=MODERATE and l2_limit_for_tiling
applied without the '[l2tc] VALIDATE FAIL - symbolic shape' warning.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/acceleration/tensorrt/engine_manager.py | 2 +-
 src/streamdiffusion/acceleration/tensorrt/models/models.py  | 4 +++-
 src/streamdiffusion/wrapper.py                              | 3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
index 471f0de4..f59da463 100644
--- a/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
+++ b/src/streamdiffusion/acceleration/tensorrt/engine_manager.py
@@ -230,7 +230,7 @@ def _get_default_controlnet_build_options(
             "opt_image_height": opt_image_height,
             "opt_image_width": opt_image_width,
             "build_dynamic_shape": build_dynamic_shape,
-            "build_static_batch": False,
+            "build_static_batch": True,
         }
         if build_dynamic_shape:
             opts["min_image_resolution"] = 384
diff --git a/src/streamdiffusion/acceleration/tensorrt/models/models.py b/src/streamdiffusion/acceleration/tensorrt/models/models.py
index 62f7490d..99f306dc 100644
--- a/src/streamdiffusion/acceleration/tensorrt/models/models.py
+++ b/src/streamdiffusion/acceleration/tensorrt/models/models.py
@@ -151,7 +151,9 @@ def check_dims(self, batch_size, image_height, image_width):
 
     def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
         if static_batch:
-            min_batch = max(1, batch_size - 1)
+            # Fully static: min=opt=max so TRT sees no symbolic batch dim.
+            # Required for l2tc (L2 tiling) which checks that ALL dims are concrete.
+            min_batch = batch_size
             max_batch = batch_size
         else:
             min_batch = self.min_batch
diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index c215ad82..b6aa3bb9 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -1790,6 +1790,7 @@ def _load_model(
                         "opt_image_height": self.height,
                         "opt_image_width": self.width,
                         "build_dynamic_shape": False,
+                        "build_static_batch": True,
                     },
                 )
 
@@ -1813,6 +1814,7 @@ def _load_model(
                         "opt_image_height": self.height,
                         "opt_image_width": self.width,
                         "build_dynamic_shape": False,
+                        "build_static_batch": True,
                     },
                 )
 
@@ -1828,6 +1830,7 @@ def _load_model(
                         "opt_image_height": self.height,
                         "opt_image_width": self.width,
                         "build_dynamic_shape": False,
+                        "build_static_batch": True,
                     }
                     if fp8:
                         from streamdiffusion.acceleration.tensorrt.fp8_quantize import (

From 6e5ca1104ecb2fd37cde21a8fd11d2e9b18fe957 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 22:35:08 -0400
Subject: [PATCH 05/10] feat(trt): add TRT profiling infrastructure gated by
 STREAMDIFFUSION_PROFILE_TRT

- Add TRTProfiler class (IProfiler impl): per-layer timing with start/end_run,
  get_summary() aggregating median times across last N runs
- Set profiling_verbosity=DETAILED in both FP16 and FP8 build paths so new
  engines embed layer names + tactic IDs for meaningful profiling output
- Engine.activate(): attach TRTProfiler + log when env var is set
- Engine.infer(): disable CUDA graphs when profiler is attached (IProfiler
  cannot report per-layer times through graph replay); wrap execution with
  start_run/end_run; sync stream before end_run to ensure all callbacks fired
- Engine.dump_profile(): log per-layer summary, no-op when profiler is None
- UNet2DConditionModelEngine, AutoencoderKLEngine, ControlNetModelEngine:
  add dump_profile() delegation to underlying Engine

Zero overhead in production (env var not set = no profiler created, CUDA graphs
work normally). Enable with: set STREAMDIFFUSION_PROFILE_TRT=1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../runtime_engines/controlnet_engine.py      | 117 +++++++++-------
 .../tensorrt/runtime_engines/unet_engine.py   |  15 +++
 .../acceleration/tensorrt/utilities.py        | 125 ++++++++++++++++++
 3 files changed, 210 insertions(+), 47 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/runtime_engines/controlnet_engine.py b/src/streamdiffusion/acceleration/tensorrt/runtime_engines/controlnet_engine.py
index a50453b9..676f2e7f 100644
--- a/src/streamdiffusion/acceleration/tensorrt/runtime_engines/controlnet_engine.py
+++ b/src/streamdiffusion/acceleration/tensorrt/runtime_engines/controlnet_engine.py
@@ -1,115 +1,137 @@
 """ControlNet TensorRT Engine with PyTorch fallback"""
 
-import torch
 import logging
-from typing import List, Optional, Tuple, Dict
+from typing import Dict, List, Optional, Tuple
+
+import torch
 from polygraphy import cuda
 
 from ..utilities import Engine
 
+
 # Set up logger for this module
 logger = logging.getLogger(__name__)
 
 
 class ControlNetModelEngine:
     """TensorRT-accelerated ControlNet inference engine"""
-    
-    def __init__(self, engine_path: str, stream: 'cuda.Stream', use_cuda_graph: bool = False, model_type: str = "sd15"):
+
+    def __init__(
+        self, engine_path: str, stream: "cuda.Stream", use_cuda_graph: bool = False, model_type: str = "sd15"
+    ):
         """Initialize ControlNet TensorRT engine"""
         self.engine = Engine(engine_path)
         self.stream = stream
         self.use_cuda_graph = use_cuda_graph
         self.model_type = model_type.lower()
-        
+
         self.engine.load()
         self.engine.activate()
-        
+
         self._input_names = None
         self._output_names = None
-        
+
         # Pre-compute model-specific values to eliminate runtime branching
         if self.model_type in ["sdxl", "sdxl_turbo"]:
             self.max_blocks = 9
             self.down_block_configs = [
-                (320, 1), (320, 1), (320, 1), (320, 2),
-                (640, 2), (640, 2), (640, 4),
-                (1280, 4), (1280, 4)
+                (320, 1),
+                (320, 1),
+                (320, 1),
+                (320, 2),
+                (640, 2),
+                (640, 2),
+                (640, 4),
+                (1280, 4),
+                (1280, 4),
             ]
             self.mid_block_channels = 1280
             self.mid_downsample_factor = 4
         else:
             self.max_blocks = 12
             self.down_block_configs = [
-                (320, 1), (320, 1), (320, 1), (320, 2), (640, 2), (640, 2),
-                (640, 4), (1280, 4), (1280, 4), (1280, 8), (1280, 8), (1280, 8)
+                (320, 1),
+                (320, 1),
+                (320, 1),
+                (320, 2),
+                (640, 2),
+                (640, 2),
+                (640, 4),
+                (1280, 4),
+                (1280, 4),
+                (1280, 8),
+                (1280, 8),
+                (1280, 8),
             ]
             self.mid_block_channels = 1280
             self.mid_downsample_factor = 8
-        
+
         self._shape_cache = {}
-    
-    def _resolve_output_shapes(self, batch_size: int, latent_height: int, latent_width: int) -> Dict[str, Tuple[int, ...]]:
+
+    def _resolve_output_shapes(
+        self, batch_size: int, latent_height: int, latent_width: int
+    ) -> Dict[str, Tuple[int, ...]]:
         """Optimized shape resolution using pre-computed configurations"""
         cache_key = (batch_size, latent_height, latent_width)
         if cache_key in self._shape_cache:
             return self._shape_cache[cache_key]
-        
+
         output_shapes = {}
-        
+
         # Generate down block shapes using pre-computed configs
         for i, (channels, factor) in enumerate(self.down_block_configs):
             output_name = f"down_block_{i:02d}"
             h = max(1, latent_height // factor)
             w = max(1, latent_width // factor)
             output_shapes[output_name] = (batch_size, channels, h, w)
-        
+
         # Generate mid block shape
         mid_h = max(1, latent_height // self.mid_downsample_factor)
         mid_w = max(1, latent_width // self.mid_downsample_factor)
         output_shapes["mid_block"] = (batch_size, self.mid_block_channels, mid_h, mid_w)
-        
+
         self._shape_cache[cache_key] = output_shapes
         return output_shapes
 
-    def __call__(self, 
-                 sample: torch.Tensor,
-                 timestep: torch.Tensor, 
-                 encoder_hidden_states: torch.Tensor,
-                 controlnet_cond: torch.Tensor,
-                 conditioning_scale: float = 1.0,
-                 text_embeds: Optional[torch.Tensor] = None,
-                 time_ids: Optional[torch.Tensor] = None,
-                 **kwargs) -> Tuple[List[torch.Tensor], torch.Tensor]:
+    def __call__(
+        self,
+        sample: torch.Tensor,
+        timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        controlnet_cond: torch.Tensor,
+        conditioning_scale: float = 1.0,
+        text_embeds: Optional[torch.Tensor] = None,
+        time_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[List[torch.Tensor], torch.Tensor]:
         """Forward pass through TensorRT ControlNet engine"""
         if timestep.dtype != torch.float32:
             timestep = timestep.float()
-        
+
         input_dict = {
             "sample": sample,
             "timestep": timestep,
             "encoder_hidden_states": encoder_hidden_states,
             "controlnet_cond": controlnet_cond,
-            "conditioning_scale": torch.tensor(conditioning_scale, dtype=torch.float32, device=sample.device)
+            "conditioning_scale": torch.tensor(conditioning_scale, dtype=torch.float32, device=sample.device),
         }
-        
+
         if text_embeds is not None:
             input_dict["text_embeds"] = text_embeds
         if time_ids is not None:
             input_dict["time_ids"] = time_ids
-        
 
-        
         shape_dict = {name: tensor.shape for name, tensor in input_dict.items()}
-        
+
         batch_size = sample.shape[0]
         latent_height = sample.shape[2]
         latent_width = sample.shape[3]
-        
+
         output_shapes = self._resolve_output_shapes(batch_size, latent_height, latent_width)
         shape_dict.update(output_shapes)
-        
+
         self.engine.allocate_buffers(shape_dict=shape_dict, device=sample.device)
-        
+
         outputs = self.engine.infer(
             input_dict,
             self.stream,
@@ -120,24 +142,25 @@ def __call__(self,
         # _extract_controlnet_outputs only slices the tensor dict (no GPU work), so no sync required.
 
         down_blocks, mid_block = self._extract_controlnet_outputs(outputs)
-        
+
         return down_blocks, mid_block
-    
+
+    def dump_profile(self, last_n: int = 10) -> None:
+        """Delegate per-layer profiling summary to the underlying TRT Engine.
+
+        No-op when STREAMDIFFUSION_PROFILE_TRT is not set.
+        """
+        self.engine.dump_profile(last_n)
+
     def _extract_controlnet_outputs(self, outputs: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], torch.Tensor]:
         """Extract and organize ControlNet outputs from engine results"""
         down_blocks = []
-        
+
         for i in range(self.max_blocks):
             output_name = f"down_block_{i:02d}"
             if output_name in outputs:
                 tensor = outputs[output_name]
                 down_blocks.append(tensor)
-        
+
         mid_block = outputs.get("mid_block")
         return down_blocks, mid_block
-    
-
-
-
-
- 
\ No newline at end of file
diff --git a/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py b/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py
index aaaad587..0a63f9b9 100644
--- a/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py
+++ b/src/streamdiffusion/acceleration/tensorrt/runtime_engines/unet_engine.py
@@ -319,6 +319,13 @@ def _generate_dummy_controlnet_specs(self, latent_model_input: torch.Tensor) ->
 
         return temp_unet.get_control(image_height, image_width)
 
+    def dump_profile(self, last_n: int = 10) -> None:
+        """Delegate per-layer profiling summary to the underlying TRT Engine.
+
+        No-op when STREAMDIFFUSION_PROFILE_TRT is not set.
+        """
+        self.engine.dump_profile(last_n)
+
     def to(self, *args, **kwargs):
         pass
 
@@ -386,6 +393,14 @@ def decode(self, latent: torch.Tensor, **kwargs):
         )["images"]
         return DecoderOutput(sample=images)
 
+    def dump_profile(self, last_n: int = 10) -> None:
+        """Delegate per-layer profiling summary to encoder and decoder TRT Engines.
+
+        No-op when STREAMDIFFUSION_PROFILE_TRT is not set.
+        """
+        self.encoder.dump_profile(last_n)
+        self.decoder.dump_profile(last_n)
+
     def to(self, *args, **kwargs):
         pass
 
diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index 5c8f3663..fc57af7b 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -336,6 +336,84 @@ def CUASSERT(cuda_ret):
     return None
 
 
+class TRTProfiler(trt.IProfiler):
+    """
+    Per-layer TRT timing profiler.
+
+    Activated by setting the STREAMDIFFUSION_PROFILE_TRT environment variable.
+    Attach to Engine.context after create_execution_context(); TRT will call
+    report_layer_time() once per layer per inference pass.
+
+    NOTE: Attaching a profiler disables CUDA graph replay for that engine
+    (IProfiler cannot report per-layer times through a captured graph).
+    Production inference always runs without profiler — zero overhead.
+
+    Usage:
+        set STREAMDIFFUSION_PROFILE_TRT=1
+        python td_main.py
+        # After N iterations, call engine.dump_profile()
+
+    Nsight Systems workflow (standalone .engine files):
+        # Build with profilingVerbosity=DETAILED (done automatically at build time)
+        # Profile with trtexec:
+        trtexec --loadEngine=unet.engine --noDataTransfers --useSpinWait \\
+                --warmUp=0 --duration=0 --iterations=50 \\
+                --profilingVerbosity=detailed --dumpProfile --separateProfileRun
+        # For CUDA graph per-kernel view, add --useCudaGraph --cuda-graph-trace=node
+        # and wrap with: nsys profile --capture-range cudaProfilerApi trtexec ...
+    """
+
+    def __init__(self, name: str = ""):
+        super().__init__()
+        self.name = name
+        self._runs: list = []        # list of lists: [[( layer_name, ms ), ...], ...]
+        self._current: list = []     # accumulator for the in-progress inference
+
+    def report_layer_time(self, layer_name: str, ms: float) -> None:  # noqa: N802
+        self._current.append((layer_name, ms))
+
+    def start_run(self) -> None:
+        self._current = []
+
+    def end_run(self) -> None:
+        if self._current:
+            self._runs.append(self._current)
+        self._current = []
+
+    def get_summary(self, last_n: int = 10) -> str:
+        if not self._runs:
+            return f"[{self.name}] No profiling data collected yet."
+
+        runs = self._runs[-last_n:]
+        from collections import defaultdict
+        totals: dict = defaultdict(list)
+        for run in runs:
+            for layer_name, ms in run:
+                totals[layer_name].append(ms)
+
+        # Sort by median descending
+        def _median(v):
+            s = sorted(v)
+            return s[len(s) // 2]
+
+        sorted_layers = sorted(totals.items(), key=lambda x: _median(x[1]), reverse=True)
+        total_ms = sum(_median(v) for _, v in sorted_layers)
+
+        lines = [
+            f"[{self.name}] Layer Profile — {len(runs)} runs, "
+            f"{total_ms:.2f} ms total (median per layer):"
+        ]
+        for layer_name, times in sorted_layers[:25]:
+            med = _median(times)
+            pct = (med / total_ms * 100) if total_ms > 0 else 0
+            lines.append(f"  {med:8.3f} ms  {pct:5.1f}%  {layer_name}")
+        remaining = len(sorted_layers) - 25
+        if remaining > 0:
+            rest_ms = sum(_median(v) for _, v in sorted_layers[25:])
+            lines.append(f"  ... {remaining} more layers  ({rest_ms:.2f} ms)")
+        return "\n".join(lines)
+
+
 class Engine:
     def __init__(
         self,
@@ -524,6 +602,13 @@ def build(
 
         config = builder.create_builder_config()
 
+        # Embed layer names + tactic IDs in the engine for runtime IProfiler support.
+        # Zero runtime cost — only affects engine metadata size (a few KB).
+        try:
+            config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
+        except AttributeError:
+            pass
+
         # Precision flags
         if fp16:
             config.set_flag(trt.BuilderFlag.FP16)
@@ -633,6 +718,13 @@ def _build_fp8(
             )
 
         config = builder.create_builder_config()
+
+        # Embed layer names + tactic IDs in the engine for runtime IProfiler support.
+        try:
+            config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
+        except AttributeError:
+            pass
+
         # BuilderFlag.STRONGLY_TYPED was removed in TRT 10.12; the network-level flag
         # (NetworkDefinitionCreationFlag.STRONGLY_TYPED, set on network creation above)
         # is now the only mechanism. On older TRT versions where BuilderFlag.STRONGLY_TYPED
@@ -710,6 +802,16 @@ def activate(self, reuse_device_memory=None):
         else:
             self.context = self.engine.create_execution_context()
 
+        # Attach per-layer profiler when STREAMDIFFUSION_PROFILE_TRT is set.
+        # Requires engines built with profiling_verbosity=DETAILED for meaningful names.
+        # NOTE: profiler presence disables CUDA graph replay in infer() — IProfiler
+        # cannot report per-layer times through a captured graph.
+        self.profiler: Optional[TRTProfiler] = None
+        if os.environ.get("STREAMDIFFUSION_PROFILE_TRT"):
+            self.profiler = TRTProfiler(name=os.path.basename(self.engine_path))
+            self.context.profiler = self.profiler
+            logger.info(f"[TRTProfiler] Attached to {os.path.basename(self.engine_path)} (CUDA graphs disabled)")
+
     def allocate_buffers(self, shape_dict=None, device="cuda"):
         # Check if we can reuse existing buffers (OPTIMIZATION)
         if self._can_reuse_buffers(shape_dict, device):
@@ -811,6 +913,12 @@ def reset_cuda_graph(self):
             self.graph = None
 
     def infer(self, feed_dict, stream, use_cuda_graph=False):
+        # IProfiler cannot report per-layer times through CUDA graph replay — disable graphs
+        # when profiler is attached. This is automatically set when STREAMDIFFUSION_PROFILE_TRT
+        # is set in activate(), so callers do not need to change anything.
+        if self.profiler is not None:
+            use_cuda_graph = False
+
         # Filter inputs to only those the engine actually exposes to avoid binding errors
         # _allowed_inputs is cached on first call — IO tensor names are immutable after engine build
         if self._allowed_inputs is None:
@@ -836,6 +944,9 @@ def infer(self, feed_dict, stream, use_cuda_graph=False):
                     )
             feed_dict = filtered_feed_dict
 
+        if self.profiler is not None:
+            self.profiler.start_run()
+
         for name, buf in feed_dict.items():
             self.tensors[name].copy_(buf)
 
@@ -868,8 +979,22 @@ def infer(self, feed_dict, stream, use_cuda_graph=False):
             if not noerror:
                 raise ValueError("ERROR: inference failed.")
 
+        if self.profiler is not None:
+            # Synchronize to ensure all IProfiler.report_layer_time() callbacks have fired
+            # before end_run() stores the accumulated per-layer data.
+            stream.synchronize()
+            self.profiler.end_run()
+
         return self.tensors
 
+    def dump_profile(self, last_n: int = 10) -> None:
+        """Log a per-layer timing summary for the last N profiled inference runs.
+
+        No-op when STREAMDIFFUSION_PROFILE_TRT is not set (profiler is None).
+        """
+        if self.profiler is not None:
+            logger.info(self.profiler.get_summary(last_n))
+
 
 def decode_images(images: torch.Tensor):
     images = (

From 2854b8bc522d767ba0b4a9ab1cde14b45270f4cf Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Mon, 6 Apr 2026 14:06:58 -0400
Subject: [PATCH 06/10] perf(trt): document builder_optimization_level=4;
 tactic 0x3e9 is TRT 10.12 bug, level 3 unsafe

---
 src/streamdiffusion/acceleration/tensorrt/utilities.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/streamdiffusion/acceleration/tensorrt/utilities.py b/src/streamdiffusion/acceleration/tensorrt/utilities.py
index fc57af7b..9b0f2e06 100644
--- a/src/streamdiffusion/acceleration/tensorrt/utilities.py
+++ b/src/streamdiffusion/acceleration/tensorrt/utilities.py
@@ -130,9 +130,11 @@ def detect_gpu_profile(device: int = 0) -> GPUBuildProfile:
     sms = props.multi_processor_count
 
     # --- Tier selection ---
-    # opt_level=4 for all tiers: always compiles dynamic kernels (better than
-    # level-3 heuristics) without level-5's "compare dynamic vs static" extra pass
-    # which OOMs during tactic profiling on dynamic-shape engines (160 GiB request).
+    # opt_level=4 for all tiers: always compiles dynamic kernels (better kernel
+    # selection than level-3 heuristics, even for static shapes). Level 5 avoided —
+    # causes OOM during tactic profiling (160 GiB requests observed).
+    # NOTE: tactic 0x3e9 "Assertion g.nodes.size() == 0" errors in TRT 10.12 are
+    # a known TRT bug — benign, the tactic is skipped and build succeeds.
     if cc >= (12, 0):
         tier = "blackwell"
         opt_level = 4

From 8d6e0663949c188a89b3af0320cd43c9d46992f2 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sun, 5 Apr 2026 05:17:43 -0400
Subject: [PATCH 07/10] fix(trt): guard aten::copy behind _use_prealloc to
 unblock ONNX export with use_cached_attn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CachedSTAttnProcessor2_0 unconditionally used .copy_() which produces
aten::copy during torch.onnx.export() tracing — no ONNX symbolic exists
for this op, crashing UNet export with use_cached_attn=True.

Added _use_prealloc=False flag (default):
- False: ONNX-safe .clone() / torch.stack() path used during tracing
- True: zero-alloc .copy_() path for non-TRT runtime (set externally)

For TRT builds processors don't run at inference time (engine handles
KV cache internally), so _use_prealloc=True is only relevant for
non-TRT acceleration paths.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../acceleration/tensorrt/models/attention_processors.py      | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/streamdiffusion/acceleration/tensorrt/models/attention_processors.py b/src/streamdiffusion/acceleration/tensorrt/models/attention_processors.py
index a06fc0a5..6179ffc9 100644
--- a/src/streamdiffusion/acceleration/tensorrt/models/attention_processors.py
+++ b/src/streamdiffusion/acceleration/tensorrt/models/attention_processors.py
@@ -18,6 +18,10 @@ def __init__(self):
         self._curr_key_buf: Optional[torch.Tensor] = None
         self._curr_value_buf: Optional[torch.Tensor] = None
         self._kv_out_buf: Optional[torch.Tensor] = None  # shape: (2, 1, B, seq, inner_dim)
+        # When False (default): ONNX-safe .clone() path — used during torch.onnx.export() tracing.
+        # When True: zero-alloc .copy_() path — set after ONNX export for non-TRT runtime inference.
+        # NOTE: aten::copy has no ONNX symbolic and cannot be traced; never set True before export.
+        self._use_prealloc: bool = False
 
         # Pre-allocated buffers for zero-alloc hot path (lazy init on first call).
         # _use_prealloc is False by default so ONNX export tracing uses the original

From 7db3e758c58108a7b39f9d85178813595ad7f3c3 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sat, 4 Apr 2026 21:22:04 -0400
Subject: [PATCH 08/10] fix: guard ControlNet TRT engine compilation behind
 acceleration check

---
 src/streamdiffusion/wrapper.py | 73 +++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index b6aa3bb9..f07e310f 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -2047,46 +2047,47 @@ def _load_model(
                 # Expose for later updates if needed by caller code
                 stream._controlnet_module = cn_module
 
-                try:
-                    compiled_cn_engines = []
-                    for cfg, cn_model in zip(configs, cn_module.controlnets):
-                        if not cfg or not cfg.get("model_id") or cn_model is None:
-                            continue
-                        try:
-                            engine = engine_manager.get_or_load_controlnet_engine(
-                                model_id=cfg["model_id"],
-                                pytorch_model=cn_model,
-                                model_type=model_type,
-                                batch_size=stream.trt_unet_batch_size,
-                                max_batch_size=self.max_batch_size,
-                                min_batch_size=self.min_batch_size,
-                                cuda_stream=cuda_stream,
-                                use_cuda_graph=False,
-                                unet=None,
-                                model_path=cfg["model_id"],
-                                load_engine=load_engine,
-                                conditioning_channels=cfg.get("conditioning_channels", 3),
-                            )
+                if acceleration == "tensorrt":
+                    try:
+                        compiled_cn_engines = []
+                        for cfg, cn_model in zip(configs, cn_module.controlnets):
+                            if not cfg or not cfg.get("model_id") or cn_model is None:
+                                continue
+                            try:
+                                engine = engine_manager.get_or_load_controlnet_engine(
+                                    model_id=cfg["model_id"],
+                                    pytorch_model=cn_model,
+                                    model_type=model_type,
+                                    batch_size=stream.trt_unet_batch_size,
+                                    max_batch_size=self.max_batch_size,
+                                    min_batch_size=self.min_batch_size,
+                                    cuda_stream=cuda_stream,
+                                    use_cuda_graph=False,
+                                    unet=None,
+                                    model_path=cfg["model_id"],
+                                    load_engine=load_engine,
+                                    conditioning_channels=cfg.get("conditioning_channels", 3),
+                                )
+                                try:
+                                    setattr(engine, "model_id", cfg["model_id"])
+                                except Exception:
+                                    pass
+                                compiled_cn_engines.append(engine)
+                            except Exception as e:
+                                logger.warning(f"Failed to compile/load ControlNet engine for {cfg.get('model_id')}: {e}")
+                        if compiled_cn_engines:
+                            setattr(stream, "controlnet_engines", compiled_cn_engines)
                             try:
-                                setattr(engine, "model_id", cfg["model_id"])
+                                logger.info(f"Compiled/loaded {len(compiled_cn_engines)} ControlNet TensorRT engine(s)")
                             except Exception:
                                 pass
-                            compiled_cn_engines.append(engine)
-                        except Exception as e:
-                            logger.warning(f"Failed to compile/load ControlNet engine for {cfg.get('model_id')}: {e}")
-                    if compiled_cn_engines:
-                        setattr(stream, "controlnet_engines", compiled_cn_engines)
-                        try:
-                            logger.info(f"Compiled/loaded {len(compiled_cn_engines)} ControlNet TensorRT engine(s)")
-                        except Exception:
-                            pass
-                except Exception:
-                    import traceback
+                    except Exception:
+                        import traceback
 
-                    traceback.print_exc()
-                    logger.warning(
-                        "ControlNet TensorRT engine build step encountered an issue; continuing with PyTorch ControlNet"
-                    )
+                        traceback.print_exc()
+                        logger.warning(
+                            "ControlNet TensorRT engine build step encountered an issue; continuing with PyTorch ControlNet"
+                        )
             except Exception:
                 import traceback
 

From 166e90d53eca37e7d532a335b829f9b87fcd3964 Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sun, 5 Apr 2026 07:56:05 -0400
Subject: [PATCH 09/10] fix(controlnet): pass pipeline resolution to TRT engine
 builder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

get_or_load_controlnet_engine() defaulted to opt_image_height/width=704,
causing a static shape mismatch at runtime when the pipeline runs at 512×512
(latent 64×64 vs expected 88×88). Pass self.height / self.width so the engine
is built at the actual inference resolution.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/wrapper.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index f07e310f..d8047349 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -2065,6 +2065,8 @@ def _load_model(
                                     use_cuda_graph=False,
                                     unet=None,
                                     model_path=cfg["model_id"],
+                                    opt_image_height=self.height,
+                                    opt_image_width=self.width,
                                     load_engine=load_engine,
                                     conditioning_channels=cfg.get("conditioning_channels", 3),
                                 )

From da32eb0c1c61130bb3e23f59ef4cf927dc7199be Mon Sep 17 00:00:00 2001
From: "INTER.Tech" <INTER.Tech@interiam.co>
Date: Sun, 5 Apr 2026 09:19:19 -0400
Subject: [PATCH 10/10] perf(controlnet): enable CUDA graphs for ControlNet TRT
 engine

ControlNet ran with use_cuda_graph=False despite the Engine.infer() and
allocate_buffers() infrastructure supporting graph capture. Since shapes
are fixed at runtime (same resolution every frame), enabling CUDA graphs
eliminates CPU kernel launch overhead per denoising step.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/streamdiffusion/wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/streamdiffusion/wrapper.py b/src/streamdiffusion/wrapper.py
index d8047349..90f9d786 100644
--- a/src/streamdiffusion/wrapper.py
+++ b/src/streamdiffusion/wrapper.py
@@ -2062,7 +2062,7 @@ def _load_model(
                                     max_batch_size=self.max_batch_size,
                                     min_batch_size=self.min_batch_size,
                                     cuda_stream=cuda_stream,
-                                    use_cuda_graph=False,
+                                    use_cuda_graph=True,
                                     unet=None,
                                     model_path=cfg["model_id"],
                                     opt_image_height=self.height,