diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py index d208f7dc0..86bbaa466 100644 --- a/funasr/utils/load_utils.py +++ b/funasr/utils/load_utils.py @@ -200,10 +200,14 @@ def extract_fbank(data, data_len=None, data_type: str = "sound", frontend=None, data = torch.from_numpy(data) if len(data.shape) < 2: data = data[None, :] # data: [batch, N] + elif data.shape[0] > 1: + data = data.mean(dim=0, keepdim=True) # convert stereo/multi-channel to mono data_len = [data.shape[1]] if data_len is None else data_len elif isinstance(data, torch.Tensor): if len(data.shape) < 2: data = data[None, :] # data: [batch, N] + elif data.shape[0] > 1: + data = data.mean(dim=0, keepdim=True) # convert stereo/multi-channel to mono data_len = [data.shape[1]] if data_len is None else data_len elif isinstance(data, (list, tuple)): data_list, data_len = [], []