From 8c35eb7b1c07ca7b37579bd9467af16693c0ecb8 Mon Sep 17 00:00:00 2001
From: Saito-K03
Date: Fri, 27 Feb 2026 13:50:26 +0900
Subject: [PATCH 1/3] hide_incomplete_unicode now works with 3 (or more)
 bytes characters

---
 simulstreaming_whisper.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/simulstreaming_whisper.py b/simulstreaming_whisper.py
index 6f4eccd..93a91d6 100644
--- a/simulstreaming_whisper.py
+++ b/simulstreaming_whisper.py
@@ -193,16 +193,26 @@ def hide_incomplete_unicode(self, tokens):
         starts with '�'. This function hides the last incomplete unicode character
         and adds it in the next iteration.
         """
-        if self.unicode_buffer != []:
-            logger.debug(f"Hiding incomplete unicode character: {self.unicode_buffer}")
-            tokens = self.unicode_buffer + tokens
-            self.unicode_buffer = [] # clear the buffer after processing
-        chars, _ = self.model.tokenizer.split_tokens_on_unicode(tokens)
-        if len(chars) > 0 and chars[-1].endswith('�'):
-            self.unicode_buffer = tokens[-1:] # keep the last incomplete unicode character
-            logger.debug(f"Hiding incomplete unicode character: {tokens[-1:]}")
-            return tokens[:-1] # remove the last token, which is incomplete unicode character
-        return tokens
+        if tokens == []:
+            return tokens #To preserve unicode_buffer
+        tokens = self.unicode_buffer + tokens #Add previous buffered token
+        self.unicode_buffer = []
+        decoded_str_bytes = self.model.tokenizer.encoding.decode_bytes_batch(
+            [[t] for t in tokens] #Split bytes on token
+        )
+        decoded_str = bytearray().join(decoded_str_bytes).decode('utf-8', errors="replace")
+        if len(decoded_str) > 0 and decoded_str[-1].endswith('�'): #split by char won't work because multiple tokens can end up with one �.
+            for i in range(len(tokens)-1, 0, -1):
+                decoded_str_piece = bytearray().join(decoded_str_bytes[:i]).decode('utf-8', errors="replace")
+                if not (len(decoded_str_piece) > 0 and decoded_str_piece[-1].endswith('�')):
+                    self.unicode_buffer = tokens[i:]
+                    logger.debug(f"Hiding incomplete unicode character at end: {self.unicode_buffer}")
+                    return tokens[:i]
+            logger.debug(f"Failed to split token, fallback to previous behaviour")
+            self.unicode_buffer = [tokens[-1]]
+            return tokens[:-1]
+        else:
+            return tokens
 
     def process_iter(self):
         if len(self.audio_chunks) == 0:

From f7ed3f3cdd2b3e9f8f575a4908965b331ab6ce21 Mon Sep 17 00:00:00 2001
From: Saito-K03
Date: Fri, 27 Feb 2026 14:00:36 +0900
Subject: [PATCH 2/3] Fix problem where frame buffer was added on wrong
 iteration, causing "pop from empty list"

---
 simulstreaming_whisper.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/simulstreaming_whisper.py b/simulstreaming_whisper.py
index 93a91d6..4372fc9 100644
--- a/simulstreaming_whisper.py
+++ b/simulstreaming_whisper.py
@@ -147,6 +147,7 @@ def init(self, offset=None):
         self.model.refresh_segment(complete=True)
 
         self.unicode_buffer = [] # hide incomplete unicode character for the next iteration
+        self.frame_buffer = 0
 
     def insert_audio_chunk(self, audio):
         self.audio_chunks.append(torch.from_numpy(audio))
@@ -162,9 +163,10 @@ def timestamped_text(self, tokens, generation):
         split_words, split_tokens = generation["result"]["split_words"], generation["result"]["split_tokens"]
 
         frames = [p["most_attended_frames"][0] for p in pr]
-        if self.unicode_buffer != []:
-            a = [frames[0]] * len(self.unicode_buffer)
+        if frames and self.frame_buffer:
+            a = [frames[0]] * self.frame_buffer #Buffer generation["result"] if timestamp accuracy becomes problem
             frames = a + frames
+            self.frame_buffer = 0
 
         tokens = tokens.copy()
         ret = []
@@ -196,6 +198,7 @@ def hide_incomplete_unicode(self, tokens):
         if tokens == []:
             return tokens #To preserve unicode_buffer
         tokens = self.unicode_buffer + tokens #Add previous buffered token
+        self.frame_buffer = len(self.unicode_buffer)
         self.unicode_buffer = []
         decoded_str_bytes = self.model.tokenizer.encoding.decode_bytes_batch(
             [[t] for t in tokens] #Split bytes on token

From 6834b4152a8faa1e70121ade671415c2d4db54fc Mon Sep 17 00:00:00 2001
From: Saito-K03
Date: Fri, 27 Feb 2026 14:16:10 +0900
Subject: [PATCH 3/3] Add non-json output tolerance against empty output

---
 simulstreaming/whisper/whisper_streaming/whisper_server.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/simulstreaming/whisper/whisper_streaming/whisper_server.py b/simulstreaming/whisper/whisper_streaming/whisper_server.py
index 26c51d8..152e6c5 100644
--- a/simulstreaming/whisper/whisper_streaming/whisper_server.py
+++ b/simulstreaming/whisper/whisper_streaming/whisper_server.py
@@ -61,7 +61,11 @@ def send_result(self, iteration_output):
         # - the next words: segment transcript
         if iteration_output:
             if self.out_txt:
-                message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text'])
+                if iteration_output.get('start'):
+                    message = "%1.0f %1.0f %s" % (iteration_output['start'] * 1000, iteration_output['end'] * 1000, iteration_output['text'])
+                else:
+                    logger.debug("No token in this segment")
+                    return
             else:
                 message = json.dumps(iteration_output)
             print(message, flush=True, file=sys.stderr)