diff --git a/Install_StreamDiffusion.bat b/Install_StreamDiffusion.bat
new file mode 100644
index 00000000..22d1a5ab
--- /dev/null
+++ b/Install_StreamDiffusion.bat
@@ -0,0 +1,43 @@
+@echo off
+echo ========================================
+echo StreamDiffusionTD v0.3.1 Installation
+echo Daydream Fork with StreamV2V
+echo ========================================
+echo.
+
+:: Prerequisite checks
+echo Checking prerequisites...
+
+py -3.11 --version >nul 2>&1
+if errorlevel 1 (
+    echo ERROR: Python 3.11 not found via py launcher.
+    echo Install Python 3.11 from https://python.org and ensure the py launcher is available.
+    pause
+    exit /b 1
+)
+
+git --version >nul 2>&1
+if errorlevel 1 (
+    echo ERROR: Git not found in PATH.
+    echo Install Git from https://git-scm.com/ ^(required for pip git+ packages^).
+    pause
+    exit /b 1
+)
+
+where cl.exe >nul 2>&1
+if errorlevel 1 (
+    echo WARNING: C++ compiler ^(cl.exe^) not found. Some packages may require it to build.
+    echo If installation fails, install Visual Studio Build Tools from:
+    echo https://visualstudio.microsoft.com/visual-cpp-build-tools/
+    echo.
+)
+
+echo Prerequisites OK. Starting installation...
+echo.
+
+cd /d "%~dp0"
+cd StreamDiffusion-installer
+
+py -3.11 -m sd_installer --base-folder "%~dp0." install --cuda cu128 --no-cache
+
+pause
diff --git a/Install_TensorRT.bat b/Install_TensorRT.bat
new file mode 100644
index 00000000..f269db7b
--- /dev/null
+++ b/Install_TensorRT.bat
@@ -0,0 +1,34 @@
+@echo off
+echo ========================================
+echo StreamDiffusionTD TensorRT Installation
+echo ========================================
+echo.
+
+cd /d "%~dp0"
+
+:: Check venv exists before trying to activate
+if not exist "venv\Scripts\activate.bat" (
+    echo ERROR: Virtual environment not found at venv\Scripts\activate.bat
+    echo Run Install_StreamDiffusion.bat first to create the environment.
+    pause
+    exit /b 1
+)
+
+echo Activating virtual environment...
+call "venv\Scripts\activate.bat"
+
+if "%VIRTUAL_ENV%" == "" (
+    echo ERROR: Failed to activate virtual environment.
+    pause
+    exit /b 1
+)
+echo Virtual environment activated: %VIRTUAL_ENV%
+
+echo.
+echo Installing TensorRT via CLI...
+cd StreamDiffusion-installer
+python -m sd_installer install-tensorrt
+
+echo.
+echo TensorRT installation finished
+pause
diff --git a/Start_StreamDiffusion.bat b/Start_StreamDiffusion.bat
new file mode 100644
index 00000000..50d5164d
--- /dev/null
+++ b/Start_StreamDiffusion.bat
@@ -0,0 +1,14 @@
+@echo off
+cd /d "%~dp0"
+
+:: Load runtime environment variables if set_env.bat exists
+if exist "%~dp0set_env.bat" call "%~dp0set_env.bat"
+
+if exist venv (
+    call venv\Scripts\activate.bat
+    venv\Scripts\python.exe StreamDiffusionTD\td_main.py
+) else (
+    call .venv\Scripts\activate.bat
+    .venv\Scripts\python.exe StreamDiffusionTD\td_main.py
+)
+pause
diff --git a/StreamDiffusion-installer b/StreamDiffusion-installer
new file mode 160000
index 00000000..24a5693b
--- /dev/null
+++ b/StreamDiffusion-installer
@@ -0,0 +1 @@
+Subproject commit 24a5693b07868fd679111b4dd2de5ddc753a2cc0
diff --git a/StreamDiffusionTD/install_tensorrt.py b/StreamDiffusionTD/install_tensorrt.py
new file mode 100644
index 00000000..5f169ae3
--- /dev/null
+++ b/StreamDiffusionTD/install_tensorrt.py
@@ -0,0 +1,157 @@
+"""
+Standalone TensorRT installation script for StreamDiffusionTD
+This is a self-contained version that doesn't rely on the streamdiffusion package imports
+
+Version pins aligned with sd_installer/tensorrt.py and src/streamdiffusion/tools/install-tensorrt.py
+"""
+
+import platform
+import subprocess
+import sys
+from typing import Optional
+
+# Canonical version pins — keep in sync with sd_installer/tensorrt.py
+TENSORRT_PINS = {
+    "cu12": {
+        "cudnn": "nvidia-cudnn-cu12==9.7.1.26",
+        "tensorrt": "tensorrt==10.12.0.36",
+    },
+    "cu11": {
+        "cudnn": "nvidia-cudnn-cu11==8.9.7.29",
+        "tensorrt": "tensorrt==9.0.1.post11.dev4",
+    },
+    "polygraphy": "polygraphy==0.49.26",
+    "onnx_graphsurgeon": "onnx-graphsurgeon==0.5.8",
+    "pywin32": "pywin32==311",
+    "triton_windows": "triton-windows==3.4.0.post21",
+}
+
+
+def run_pip(command: str):
+    """Run pip command with proper error handling"""
+    return subprocess.check_call([sys.executable, "-m", "pip"] + command.split())
+
+
+def is_installed(package_name: str) -> bool:
+    """Check if a package is installed"""
+    try:
+        __import__(package_name.replace("-", "_"))
+        return True
+    except ImportError:
+        return False
+
+
+def version(package_name: str) -> Optional[str]:
+    """Get version of installed package"""
+    try:
+        import importlib.metadata
+        return importlib.metadata.version(package_name)
+    except Exception:
+        return None
+
+
+def get_cuda_version_from_torch() -> Optional[str]:
+    try:
+        import torch
+    except ImportError:
+        return None
+
+    cuda_version = torch.version.cuda
+    if cuda_version:
+        # Return full version like "12.8" for better detection
+        major_minor = ".".join(cuda_version.split(".")[:2])
+        return major_minor
+    return None
+
+
+def install(cu: Optional[str] = None):
+    if cu is None:
+        cu = get_cuda_version_from_torch()
+
+    if cu is None:
+        print("Could not detect CUDA version. Please specify manually.")
+        return
+
+    print(f"Detected CUDA version: {cu}")
+    print("Installing TensorRT requirements...")
+
+    # Determine CUDA major version for package selection
+    cuda_major = cu.split(".")[0] if cu else "12"
+    cuda_minor = int(cu.split(".")[1]) if "." in cu else 0  # int compare: float() would rank "12.10" below "12.8"
+
+    # Uninstall old TensorRT versions (anything below 10.8)
+    if is_installed("tensorrt"):
+        current_version_str = version("tensorrt")
+        if current_version_str:
+            try:
+                from packaging.version import Version
+                needs_uninstall = Version(current_version_str) < Version("10.8.0")
+            except ImportError:
+                # packaging not available - compare by major version
+                try:
+                    major = int(current_version_str.split(".")[0])
+                    needs_uninstall = major < 10
+                except (ValueError, IndexError):
+                    needs_uninstall = False
+            if needs_uninstall:
+                print("Uninstalling old TensorRT version...")
+                run_pip("uninstall -y tensorrt")
+
+    if cuda_major == "12":
+        pins = TENSORRT_PINS["cu12"]
+        if cuda_minor >= 8:
+            print("Installing TensorRT 10.12+ for CUDA 12.8+ (Blackwell GPU support)...")
+        else:
+            print("Installing TensorRT for CUDA 12.x...")
+
+        cudnn_name = pins["cudnn"]
+        tensorrt_pkg = pins["tensorrt"]
+
+        print(f"Installing cuDNN: {cudnn_name}")
+        run_pip(f"install {cudnn_name} --no-cache-dir")
+
+        print(f"Installing TensorRT for CUDA {cu}: {tensorrt_pkg}")
+        run_pip(f"install --extra-index-url https://pypi.nvidia.com {tensorrt_pkg} --no-cache-dir")
+
+    elif cuda_major == "11":
+        pins = TENSORRT_PINS["cu11"]
+        print("Installing TensorRT for CUDA 11.x...")
+
+        cudnn_name = pins["cudnn"]
+        tensorrt_pkg = pins["tensorrt"]
+
+        print(f"Installing cuDNN: {cudnn_name}")
+        run_pip(f"install {cudnn_name} --no-cache-dir")
+
+        print(f"Installing TensorRT for CUDA {cu}: {tensorrt_pkg}")
+        run_pip(
+            f"install --pre --extra-index-url https://pypi.nvidia.com {tensorrt_pkg} --no-cache-dir"
+        )
+    else:
+        print(f"Unsupported CUDA version: {cu}")
+        print("Supported versions: CUDA 11.x, 12.x")
+        return
+
+    # Install additional TensorRT tools (pinned versions)
+    if not is_installed("polygraphy"):
+        print("Installing polygraphy...")
+        run_pip(
+            f"install {TENSORRT_PINS['polygraphy']} --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir"
+        )
+    if not is_installed("onnx_graphsurgeon"):
+        print("Installing onnx-graphsurgeon...")
+        run_pip(
+            f"install {TENSORRT_PINS['onnx_graphsurgeon']} --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir"
+        )
+    if platform.system() == "Windows" and not is_installed("pywin32"):
+        print("Installing pywin32...")
+        run_pip(f"install {TENSORRT_PINS['pywin32']} --no-cache-dir")
+    if platform.system() == "Windows" and not is_installed("triton"):
+        print("Installing triton-windows...")
+        run_pip(f"install {TENSORRT_PINS['triton_windows']} --no-cache-dir")
+
+    print("TensorRT installation completed successfully!")
+
+
+if __name__ == "__main__":
+    install()
diff --git a/set_env.bat b/set_env.bat
new file mode 100644
index 00000000..519f3124
--- /dev/null
+++ b/set_env.bat
@@ -0,0 +1,22 @@
+@echo off
+:: StreamDiffusionTD Runtime Environment Variables
+:: Called automatically by Start_StreamDiffusion.bat if this file exists.
+:: Edit values here to tune GPU memory and CUDA behavior.
+
+:: Reduce CUDA memory fragmentation (required for large models at 512x512+)
+set PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
+
+:: Lazy CUDA module loading — speeds up startup, reduces VRAM footprint
+set CUDA_MODULE_LOADING=LAZY
+
+:: L2 cache persistence (Ampere+ only, compute 8.0+)
+:: Set to "0" to disable. Default: "1" (enabled, 64 MB reserved)
+set SDTD_L2_PERSIST=1
+set SDTD_L2_PERSIST_MB=64
+
+:: HuggingFace offline mode — set to "1" to use cached models only (no downloads)
+:: set HF_HUB_OFFLINE=1
+:: set TRANSFORMERS_OFFLINE=1
+
+:: Uncomment to override CUDA version detected by setup.py (e.g., for CI)
+:: set STREAMDIFFUSION_CUDA_VERSION=12.8
diff --git a/setup.py b/setup.py
index 4255f52c..d226b9d5 100644
--- a/setup.py
+++ b/setup.py
@@ -4,11 +4,11 @@ from setuptools import find_packages, setup
 
+# Copied from pip_utils.py to avoid import
 def _check_torch_installed():
     try:
         import torch
-        import torchvision
     except Exception:
         msg = (
             "Missing required pre-installed packages: torch, torchvision\n"
@@ -19,16 +19,18 @@ def _check_torch_installed():
         raise RuntimeError(msg)
 
     if not torch.version.cuda:
-        raise RuntimeError("Detected CPU-only PyTorch. Install CUDA-enabled torch/vision/audio before installing this package.")
+        raise RuntimeError(
+            "Detected CPU-only PyTorch. Install CUDA-enabled torch/vision/audio before installing this package."
+        )
 
 
 def get_cuda_constraint():
-    cuda_version = os.environ.get("STREAMDIFFUSION_CUDA_VERSION") or \
-        os.environ.get("CUDA_VERSION")
+    cuda_version = os.environ.get("STREAMDIFFUSION_CUDA_VERSION") or os.environ.get("CUDA_VERSION")
     if not cuda_version:
         try:
             import torch
+
             cuda_version = torch.version.cuda
         except Exception:
             # might not be available during wheel build, so we have to ignore
@@ -53,15 +55,14 @@ get_cuda_constraint():
     "transformers==4.56.0",
     "accelerate==1.13.0",
     "huggingface_hub==0.35.0",
-    "Pillow>=12.1.1",  # CVE-2026-25990: out-of-bounds write in PSD loading
+    "Pillow>=12.2.0",  # CVE-2026-25990: out-of-bounds write in PSD loading; 12.2.0 verified
     "fire==0.7.1",
     "omegaconf==2.3.0",
-    "onnx==1.18.0",  # onnx-graphsurgeon 0.5.8 requires onnx.helper.float32_to_bfloat16 (removed in onnx 1.19+)
-    "onnxruntime==1.24.3",
-    "onnxruntime-gpu==1.24.3",
+    "onnx==1.18.0",  # IR 11 — modelopt needs FLOAT4E2M1 (added in 1.18); float32_to_bfloat16 present (removed in 1.19+)
+    "onnxruntime-gpu==1.24.4",  # TRT EP, supports IR 11; never co-install CPU onnxruntime — shared files conflict
     "polygraphy==0.49.26",
     "protobuf>=4.25.8,<5",  # mediapipe 0.10.21 requires protobuf 4.x; 4.25.8 fixes CVE-2025-4565; CVE-2026-0994 (JSON DoS) accepted risk for local pipeline
-    "colored==2.3.1",
+    "colored==2.3.2",
     "pywin32==311;sys_platform == 'win32'",
     "onnx-graphsurgeon==0.5.8",
     "controlnet-aux==0.0.10",
@@ -82,7 +83,9 @@ def deps_list(*pkgs):
 
 extras = {}
 extras["xformers"] = deps_list("xformers")
 extras["torch"] = deps_list("torch", "accelerate")
-extras["tensorrt"] = deps_list("protobuf", "cuda-python", "onnx", "onnxruntime", "onnxruntime-gpu", "colored", "polygraphy", "onnx-graphsurgeon")
+extras["tensorrt"] = deps_list(
+    "protobuf", "cuda-python", "onnx", "onnxruntime-gpu", "colored", "polygraphy", "onnx-graphsurgeon"
+)
 extras["controlnet"] = deps_list("onnx-graphsurgeon", "controlnet-aux")
 extras["ipadapter"] = deps_list("diffusers-ipadapter", "mediapipe", "insightface")
diff --git a/src/streamdiffusion/pip_utils.py b/src/streamdiffusion/pip_utils.py
index 9395c548..4a28c0a0 100644
--- a/src/streamdiffusion/pip_utils.py
+++ b/src/streamdiffusion/pip_utils.py
@@ -17,7 +17,6 @@ def _check_torch_installed():
     try:
         import torch
-        import torchvision  # type: ignore
     except Exception:
         msg = (
             "Missing required pre-installed packages: torch, torchvision\n"
@@ -28,13 +27,16 @@ def _check_torch_installed():
         raise RuntimeError(msg)
 
     if not torch.version.cuda:
-        raise RuntimeError("Detected CPU-only PyTorch. Install CUDA-enabled torch/vision/audio before installing this package.")
+        raise RuntimeError(
+            "Detected CPU-only PyTorch. Install CUDA-enabled torch/vision/audio before installing this package."
+        )
 
 
 def get_cuda_version() -> str | None:
     _check_torch_installed()
     import torch
+
     return torch.version.cuda
@@ -67,7 +69,7 @@ def is_installed(package: str) -> bool:
 
 def run_python(command: str, env: Dict[str, str] | None = None) -> str:
     run_kwargs = {
-        "args": f"\"{python}\" {command}",
+        "args": f'"{python}" {command}',
         "shell": True,
         "env": os.environ if env is None else env,
         "encoding": "utf8",
diff --git a/src/streamdiffusion/tools/install-tensorrt.py b/src/streamdiffusion/tools/install-tensorrt.py
index 46ea28b4..696960f1 100644
--- a/src/streamdiffusion/tools/install-tensorrt.py
+++ b/src/streamdiffusion/tools/install-tensorrt.py
@@ -1,10 +1,10 @@
+import platform
 from typing import Literal, Optional
 
 import fire
 from packaging.version import Version
 
-from ..pip_utils import is_installed, run_pip, version, get_cuda_major
-import platform
+from ..pip_utils import get_cuda_major, is_installed, run_pip, version
 
 
 def install(cu: Optional[Literal["11", "12"]] = get_cuda_major()):
@@ -20,28 +20,34 @@ def install(cu: Optional[Literal["11", "12"]] = get_cuda_major()):
     cudnn_package, trt_package = (
         ("nvidia-cudnn-cu12==9.7.1.26", "tensorrt==10.12.0.36")
-        if cu == "12" else
-        ("nvidia-cudnn-cu11==8.9.7.29", "tensorrt==9.0.1.post11.dev4")
+        if cu == "12"
+        else ("nvidia-cudnn-cu11==8.9.7.29", "tensorrt==9.0.1.post11.dev4")
     )
 
     if not is_installed(trt_package):
         run_pip(f"install {cudnn_package} --no-cache-dir")
         run_pip(f"install --extra-index-url https://pypi.nvidia.com {trt_package} --no-cache-dir")
 
     if not is_installed("polygraphy"):
-        run_pip(
-            "install polygraphy==0.49.24 --extra-index-url https://pypi.ngc.nvidia.com"
-        )
+        run_pip("install polygraphy==0.49.26 --extra-index-url https://pypi.ngc.nvidia.com")
     if not is_installed("onnx_graphsurgeon"):
+        run_pip("install onnx-graphsurgeon==0.5.8 --extra-index-url https://pypi.ngc.nvidia.com")
+    if platform.system() == "Windows" and not is_installed("pywin32"):
+        run_pip("install pywin32==311")
+    if platform.system() == "Windows" and not is_installed("triton"):
+        run_pip("install triton-windows==3.4.0.post21")
+
+    # Pin onnx 1.18 + onnxruntime-gpu 1.24 together:
+    # - onnx 1.18 exports IR 11; modelopt needs FLOAT4E2M1 added in 1.18
+    # - onnx 1.19+ exports IR 12 (ORT 1.24 max) and removes float32_to_bfloat16 (onnx-gs needs it)
+    # - onnxruntime-gpu 1.24 supports IR 11; never co-install CPU onnxruntime (shared files conflict)
+    run_pip("install onnx==1.18.0 onnxruntime-gpu==1.24.4 --no-cache-dir")
+
+    # FP8 quantization dependencies (CUDA 12 only)
+    # nvidia-modelopt requires cupy; pin cupy 13.x + numpy<2 for mediapipe compat
+    if cu == "12":
         run_pip(
-            "install onnx-graphsurgeon==0.5.8 --extra-index-url https://pypi.ngc.nvidia.com"
-        )
-    if platform.system() == 'Windows' and not is_installed("pywin32"):
-        run_pip(
-            "install pywin32==306"
-        )
-    if platform.system() == 'Windows' and not is_installed("triton"):
-        run_pip(
-            "install triton-windows==3.4.0.post21"
+            'install "nvidia-modelopt[onnx]" "cupy-cuda12x==13.6.0" "numpy==1.26.4"'
+            " --no-cache-dir"
         )