From 8c35eb7b1c07ca7b37579bd9467af16693c0ecb8 Mon Sep 17 00:00:00 2001
From: Saito-K03
Date: Fri, 27 Feb 2026 13:50:26 +0900
Subject: [PATCH 1/3] hide_incomplete_unicode now works with 3 (or more)
 bytes characters

---
 simulstreaming_whisper.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/simulstreaming_whisper.py b/simulstreaming_whisper.py
index 6f4eccd..93a91d6 100644
--- a/simulstreaming_whisper.py
+++ b/simulstreaming_whisper.py
@@ -193,16 +193,26 @@ def hide_incomplete_unicode(self, tokens):
         starts with '�'. This function hides the last incomplete unicode character
         and adds it in the next iteration.
         """
-        if self.unicode_buffer != []:
-            logger.debug(f"Hiding incomplete unicode character: {self.unicode_buffer}")
-            tokens = self.unicode_buffer + tokens
-            self.unicode_buffer = [] # clear the buffer after processing
-        chars, _ = self.model.tokenizer.split_tokens_on_unicode(tokens)
-        if len(chars) > 0 and chars[-1].endswith('�'):
-            self.unicode_buffer = tokens[-1:] # keep the last incomplete unicode character
-            logger.debug(f"Hiding incomplete unicode character: {tokens[-1:]}")
-            return tokens[:-1] # remove the last token, which is incomplete unicode character
-        return tokens
+        if tokens == []:
+            return tokens #To preserve unicode_buffer
+        tokens = self.unicode_buffer + tokens #Add previous buffered token
+        self.unicode_buffer = []
+        decoded_str_bytes = self.model.tokenizer.encoding.decode_bytes_batch(
+            [[t] for t in tokens] #Split bytes on token
+        )
+        decoded_str = bytearray().join(decoded_str_bytes).decode('utf-8', errors="replace")
+        if len(decoded_str) > 0 and decoded_str[-1].endswith('�'): #split by char won't work because multiple tokens can end up with one �.
+            for i in range(len(tokens)-1, 0, -1):
+                decoded_str_piece = bytearray().join(decoded_str_bytes[:i]).decode('utf-8', errors="replace")
+                if not (len(decoded_str_piece) > 0 and decoded_str_piece[-1].endswith('�')):
+                    self.unicode_buffer = tokens[i:]
+                    logger.debug(f"Hiding incomplete unicode character at end: {self.unicode_buffer}")
+                    return tokens[:i]
+            logger.debug(f"Failed to split token, fallback to previous behaviour")
+            self.unicode_buffer = [tokens[-1]]
+            return tokens[:-1]
+        else:
+            return tokens
 
     def process_iter(self):
         if len(self.audio_chunks) == 0:

From f7ed3f3cdd2b3e9f8f575a4908965b331ab6ce21 Mon Sep 17 00:00:00 2001
From: Saito-K03
Date: Fri, 27 Feb 2026 14:00:36 +0900
Subject: [PATCH 2/3] Fix problem where frame buffer was added on wrong
 iteration, causing "pop from empty list"

---
 simulstreaming_whisper.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/simulstreaming_whisper.py b/simulstreaming_whisper.py
index 93a91d6..4372fc9 100644
--- a/simulstreaming_whisper.py
+++ b/simulstreaming_whisper.py
@@ -147,6 +147,7 @@ def init(self, offset=None):
         self.model.refresh_segment(complete=True)
 
         self.unicode_buffer = [] # hide incomplete unicode character for the next iteration
+        self.frame_buffer = 0
 
     def insert_audio_chunk(self, audio):
         self.audio_chunks.append(torch.from_numpy(audio))
@@ -162,9 +163,10 @@ def timestamped_text(self, tokens, generation):
         split_words, split_tokens = generation["result"]["split_words"], generation["result"]["split_tokens"]
 
         frames = [p["most_attended_frames"][0] for p in pr]
-        if self.unicode_buffer != []:
-            a = [frames[0]] * len(self.unicode_buffer)
+        if frames and self.frame_buffer:
+            a = [frames[0]] * self.frame_buffer #Buffer generation["result"] if timestamp accuracy becomes problem
             frames = a + frames
+            self.frame_buffer = 0
 
         tokens = tokens.copy()
         ret = []
@@ -196,6 +198,7 @@ def hide_incomplete_unicode(self, tokens):
         if tokens == []:
             return tokens #To preserve unicode_buffer
         tokens = self.unicode_buffer + tokens #Add previous buffered token
+        self.frame_buffer = len(self.unicode_buffer)
         self.unicode_buffer = []
         decoded_str_bytes = self.model.tokenizer.encoding.decode_bytes_batch(
             [[t] for t in tokens] #Split bytes on token

From 6834b4152a8faa1e70121ade671415c2d4db54fc Mon Sep 17 00:00:00 2001
From: Saito-K03
Date: Fri, 27 Feb 2026 14:16:10 +0900
Subject: [PATCH 3/3] Add non-json output tolerance against empty output

---
 simulstreaming/whisper/whisper_streaming/whisper_server.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/simulstreaming/whisper/whisper_streaming/whisper_server.py b/simulstreaming/whisper/whisper_streaming/whisper_server.py
index 26c51d8..152e6c5 100644
--- a/simulstreaming/whisper/whisper_streaming/whisper_server.py
+++ b/simulstreaming/whisper/whisper_streaming/whisper_server.py
@@ -61,7 +61,11 @@ def send_result(self, iteration_output):
         # - the next words: segment transcript
         if iteration_output:
             if self.out_txt:
-                message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text'])
+                if iteration_output.get('start'):
+                    message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text'])
+                else:
+                    logger.debug("No token in this segment")
+                    return
             else:
                 message = json.dumps(iteration_output)
             print(message, flush=True, file=sys.stderr)