From 42d873a76888c058801b58355498fa9092cdcd33 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 20 Feb 2026 23:44:55 -0800
Subject: [PATCH] Fix #2809: remove unnecessary GPU-to-CPU transfer in VAD
 ComputeScores

The .to('cpu') call after each encoder forward pass in ComputeScores()
forced a GPU->CPU synchronization point on every VAD chunk inference,
causing ~10x latency regression on GPU (RTX 4080/4090) in offline
VAD+ASR mode.

The scores tensor stays on the same device as the encoder output;
downstream .item() calls handle the scalar extraction correctly
regardless of device.
---
 funasr/models/fsmn_vad_streaming/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py
index 7dfb96a05..51581ba25 100644
--- a/funasr/models/fsmn_vad_streaming/model.py
+++ b/funasr/models/fsmn_vad_streaming/model.py
@@ -348,7 +348,7 @@ def ComputeDecibel(self, cache: dict = {}) -> None:
 
     def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None:
-        scores = self.encoder(feats, cache=cache["encoder"]).to("cpu")  # return B * T * D
+        scores = self.encoder(feats, cache=cache["encoder"])  # return B * T * D
         assert (
             scores.shape[1] == feats.shape[1]
         ), "The shape between feats and scores does not match"
 