From 42d873a76888c058801b58355498fa9092cdcd33 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 20 Feb 2026 23:44:55 -0800
Subject: [PATCH] Fix #2809: remove unnecessary GPU-to-CPU transfer in VAD
 ComputeScores

The .to('cpu') call after each encoder forward pass in ComputeScores()
forced a GPU->CPU synchronization point on every VAD chunk inference,
causing ~10x latency regression on GPU (RTX 4080/4090) in offline
VAD+ASR mode.

The scores tensor stays on the same device as the encoder output;
downstream .item() calls handle the scalar extraction correctly
regardless of device.
---
 funasr/models/fsmn_vad_streaming/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py
index 7dfb96a05..51581ba25 100644
--- a/funasr/models/fsmn_vad_streaming/model.py
+++ b/funasr/models/fsmn_vad_streaming/model.py
@@ -348,7 +348,7 @@ def ComputeDecibel(self, cache: dict = {}) -> None:
 
     def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None:
-        scores = self.encoder(feats, cache=cache["encoder"]).to("cpu")  # return B * T * D
+        scores = self.encoder(feats, cache=cache["encoder"])  # return B * T * D
         assert (
             scores.shape[1] == feats.shape[1]
         ), "The shape between feats and scores does not match"
 