Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 50 additions & 19 deletions src/scope/core/pipelines/krea_realtime_video/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,27 @@ def __init__(
)

print(f"Quantized diffusion model to fp8 in {time.time() - start:.3f}s")

if compile:
# Float8DynamicActivationFloat8WeightConfig is incompatible with
# torch.compile(fullgraph=False): AOT autograd's gen_alias_from_base
# calls aten.as_strided on Float8Tensor outputs, which is not
# implemented. Skip block compilation when FP8 is active.
# See: https://github.com/daydreamlive/scope/issues/669
logger.warning(
"Skipping torch.compile for attention blocks: "
"Float8DynamicActivationFloat8WeightConfig is not compatible "
"with fullgraph=False compilation (aten.as_strided unsupported "
"on Float8Tensor). FP8 quantization is still active."
)
Comment on lines +141 to +152
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

FP8 + compile=True still triggers flex_attention warmup.

When FP8 quantization is active and compile=True, the code:

  1. Logs the warning and skips block.compile() (correct)
  2. But compile remains True, so initial_kv_bias is set to 0.3 (line 212)
  3. Warmup runs (line 232) with bias < 1.0, which per lines 223-226 "would otherwise enter the flex_attention code path... and trigger torch._dynamo tracing"

This defeats the purpose of skipping compilation for FP8. Consider tracking whether compilation actually occurred:

Proposed fix
+        # Track whether block compilation actually happens (FP8 is incompatible)
+        did_compile = False
+
         if quantization == Quantization.FP8_E4M3FN:
             # Cast before optional quantization
             generator = generator.to(dtype=dtype)
@@ -140,6 +143,7 @@
         else:
             generator = generator.to(device=device, dtype=dtype)

             if compile:
                 # Only compile the attention blocks
                 for block in generator.model.blocks:
                     # Disable fullgraph right now due to issues with RoPE
                     block.compile(fullgraph=False)
+                did_compile = True

         # ... later ...

         initial_kv_bias = (
-            DEFAULT_KV_CACHE_ATTENTION_BIAS if compile else KV_CACHE_ATTENTION_BIAS_DISABLED
+            DEFAULT_KV_CACHE_ATTENTION_BIAS if did_compile else KV_CACHE_ATTENTION_BIAS_DISABLED
         )

         # ... and ...

-        if compile:
+        if did_compile:
             local_attn_size = getattr(model_config, "local_attn_size", 6)

Also applies to: 211-214, 232-250

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/scope/core/pipelines/krea_realtime_video/pipeline.py`, around lines 141-152: the current logic logs that `block.compile()` is being skipped when FP8 is active, but it leaves the `compile` flag set to true. As a result, `initial_kv_bias` is still set to the compiled-path value and the warmup loop still runs, triggering flex_attention tracing even though no block was compiled.

To fix this, track whether compilation actually happened. Immediately after the Float8 check where `block.compile()` is skipped, either set a new boolean (e.g., `did_compile`) or flip `compile` to False. Then use that actual-compilation indicator in the subsequent logic: the assignment of `initial_kv_bias` and the branch that controls the warmup/flex_attention path. This ensures warmup does not run when compilation was skipped due to FP8.

else:
generator = generator.to(device=device, dtype=dtype)

if compile:
# Only compile the attention blocks
for block in generator.model.blocks:
# Disable fullgraph right now due to issues with RoPE
block.compile(fullgraph=False)
if compile:
# Only compile the attention blocks
for block in generator.model.blocks:
# Disable fullgraph right now due to issues with RoPE
block.compile(fullgraph=False)

# Load VAE using create_vae factory (supports multiple VAE types)
# Note: VAE is shared across all Wan model sizes, stored in Wan2.1-T2V-1.3B
Expand Down Expand Up @@ -189,7 +202,16 @@ def __init__(
# does not work properly
self.state.set("current_start_frame", 0)
self.state.set("manage_cache", True)
self.state.set("kv_cache_attention_bias", DEFAULT_KV_CACHE_ATTENTION_BIAS)
# When compile=False the flex_attention path (and its torch.compile call)
# must be bypassed entirely. KV_CACHE_ATTENTION_BIAS_DISABLED (1.0) is
# the sentinel that makes causal_model.py skip flex_attention and take the
# standard attention path, so use it whenever compilation is disabled.
from .modules.causal_model import KV_CACHE_ATTENTION_BIAS_DISABLED

initial_kv_bias = (
DEFAULT_KV_CACHE_ATTENTION_BIAS if compile else KV_CACHE_ATTENTION_BIAS_DISABLED
)
self.state.set("kv_cache_attention_bias", initial_kv_bias)

self.state.set("height", config.height)
self.state.set("width", config.width)
Expand All @@ -198,25 +220,34 @@ def __init__(
# Warm-up: Run enough iterations to fill the KV cache completely.
# This ensures torch.compile compiles the flex_attention kernel at the
# steady-state cache size, avoiding recompilation during actual streaming.
# Skipped when compile=False because there is no compiled kernel to prime
# and the warmup loop would otherwise enter the flex_attention code path
# (via DEFAULT_KV_CACHE_ATTENTION_BIAS) and trigger torch._dynamo tracing
# even though block.compile() was never called.
#
# Cache fills at: num_frame_per_block frames per iteration
# Cache capacity: local_attn_size frames
# Iterations needed: ceil(local_attn_size / num_frame_per_block) + 1
# (+1 to exercise the "cache full with eviction" path)
local_attn_size = getattr(model_config, "local_attn_size", 6)
num_frame_per_block = getattr(model_config, "num_frame_per_block", 3)
warmup_runs = (local_attn_size // num_frame_per_block) + 1

if stage_callback:
stage_callback("Warming up model...")
start = time.time()
for i in range(warmup_runs):
self._generate(
prompts=WARMUP_PROMPT,
init_cache=(i == 0), # Only init on first run, then accumulate
)
if compile:
local_attn_size = getattr(model_config, "local_attn_size", 6)
num_frame_per_block = getattr(model_config, "num_frame_per_block", 3)
warmup_runs = (
(local_attn_size + num_frame_per_block - 1) // num_frame_per_block
) + 1

if stage_callback:
stage_callback("Warming up model...")
start = time.time()
for i in range(warmup_runs):
self._generate(
prompts=WARMUP_PROMPT,
init_cache=(i == 0), # Only init on first run, then accumulate
)

print(f"Warmed up ({warmup_runs} runs) in {time.time() - start:.2f}s")
print(f"Warmed up ({warmup_runs} runs) in {time.time() - start:.2f}s")
else:
logger.info("torch.compile disabled — skipping warmup (no compiled kernel to prime)")

self.first_call = True
self.last_mode = None # Track mode for transition detection
Expand Down
39 changes: 34 additions & 5 deletions src/scope/server/pipeline_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ def __init__(self):
# Loading stage for frontend display (e.g., "Loading diffusion model...")
self._loading_stage: str | None = None

# Set to True if torch._dynamo.reset() failed during an unload; stale
# Dynamo/FP8 compile caches may still be present, so force compile=False
# on all subsequent pipeline loads until the worker process restarts.
self._dynamo_reset_failed: bool = False

def set_loading_stage(self, stage: str | None) -> None:
"""Set the current loading stage (thread-safe)."""
with self._lock:
Expand Down Expand Up @@ -691,6 +696,21 @@ def _unload_pipeline_by_id_unsafe(
except Exception as e:
logger.warning(f"CUDA cleanup failed: {e}")

# Reset torch.compile compilation cache to prevent stale compiled graphs
# (especially those specialized for Float8Tensor weights) from leaking into
# subsequently loaded pipelines. Without this, longlive's FP8-compiled graph
# cache can corrupt Krea's compile attempt, causing as_strided dispatch errors.
try:
torch._dynamo.reset()
logger.info("torch._dynamo cache reset")
except Exception as e:
logger.warning(
f"torch._dynamo reset failed: {e}. "
"Stale compile caches may remain in this worker; "
"forcing compile=False for all subsequent pipeline loads."
)
self._dynamo_reset_failed = True

# Publish pipeline_unloaded event
publish_event(
event_type="pipeline_unloaded",
Expand Down Expand Up @@ -959,14 +979,23 @@ def _load_pipeline_implementation(
if load_params:
quantization = load_params.get("quantization", None)

# Only compile diffusion model for hopper; skip if a prior
# torch._dynamo.reset() failed (stale caches would cause a crash).
_hopper_gpu = torch.cuda.is_available() and any(
x in torch.cuda.get_device_name(0).lower()
for x in ("h100", "hopper")
)
_should_compile = _hopper_gpu and not self._dynamo_reset_failed
if _hopper_gpu and self._dynamo_reset_failed:
logger.warning(
"torch._dynamo reset previously failed; disabling torch.compile "
"for krea-realtime-video to avoid stale-cache crash. "
"Restart the worker process to re-enable compilation."
)
pipeline = KreaRealtimeVideoPipeline(
config,
quantization=quantization,
# Only compile diffusion model for hopper right now
compile=any(
x in torch.cuda.get_device_name(0).lower()
for x in ("h100", "hopper")
),
compile=_should_compile,
device=torch.device("cuda"),
Comment thread
coderabbitai[bot] marked this conversation as resolved.
dtype=torch.bfloat16,
stage_callback=stage_callback,
Expand Down
Loading