daydreamlive · livepeer-tessa · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
diff --git a/src/scope/core/pipelines/wan2_1/blocks/preprocess_video.py b/src/scope/core/pipelines/wan2_1/blocks/preprocess_video.py
@@ -43,6 +43,12 @@ def inputs(self) -> list[InputParam]:
                 type_hint=list[torch.Tensor] | torch.Tensor | None,
                 description="Input frames for VACE conditioning",
             ),
+            InputParam(
+                "vace_input_masks",
+                default=None,
+                type_hint=torch.Tensor | None,
+                description="Spatial control masks [B, 1, F, H, W] for VACE conditioning",
+            ),
             InputParam(
                 "height",
                 required=True,
@@ -71,6 +77,11 @@ def intermediate_outputs(self) -> list[OutputParam]:
                 type_hint=torch.Tensor,
                 description="Input video to convert into noisy latents",
             ),
+            OutputParam(
+                "vace_input_masks",
+                type_hint=torch.Tensor,
+                description="Resampled VACE spatial control masks [B, 1, F, H, W]",
+            ),
         ]
 
     @torch.no_grad()
@@ -107,6 +118,34 @@ def __call__(self, components, state: PipelineState) -> tuple[Any, PipelineState
                 target_num_frames=target_num_frames,
             )
 
+        # Resample vace_input_masks to match target_num_frames.
+        # On the first chunk (current_start_frame == 0), target_num_frames is one
+        # greater than the default chunk size, so masks arriving from the queue
+        # (or a client parameter) would otherwise be one frame short, causing a
+        # shape mismatch inside VaceEncodingBlock._encode_with_conditioning.
+        if block_state.vace_input_masks is not None:
+            masks = block_state.vace_input_masks
+            if isinstance(masks, list):
+                masks = (
+                    torch.stack(masks, dim=2)
+                    if masks[0].dim() == 4
+                    else torch.stack(masks, dim=0)
+                )
+            mask_frames = masks.shape[2]
+            if mask_frames != target_num_frames:
+                indices = (
+                    torch.linspace(
+                        0,
+                        mask_frames - 1,
+                        target_num_frames,
+                        device=masks.device,
+                    )
+                    .round()
+                    .long()
+                )
+                masks = masks[:, :, indices]
+            block_state.vace_input_masks = masks
+
         self.set_block_state(state, block_state)
         return components, state
 

diff --git a/src/scope/server/pipeline_processor.py b/src/scope/server/pipeline_processor.py
@@ -437,6 +437,10 @@ def process_chunk(self):
             processing_time = time.time() - processing_start
 
             if not output_dict:
+                # Some pipelines return {} when idle. For those, prepare() is
+                # None, so we never wait on input queues; without this sleep
+                # the worker thread would busy-loop.
+                self.shutdown_event.wait(SLEEP_TIME)
                 return
 
             # Pass audio to output queue regardless of whether video exists.
@@ -456,6 +460,8 @@ def process_chunk(self):
             # Extract video from the returned dictionary
             output = output_dict.get("video")
             if output is None:
+                self.is_prepared = True
+                self._pending_cache_init = False
                 return
 
             # Clear one-shot parameters after use to prevent sending them on subsequent chunks