From 4b5644e0e37987d2e37db1400d6c21ee10593fcb Mon Sep 17 00:00:00 2001
From: spapi <spapi@fbk.eu>
Date: Fri, 27 Mar 2026 11:36:03 +0100
Subject: [PATCH 1/4] Fix _strip_incomplete_words dropping punctuation-terminated hypotheses

---
 .../speech_processors/base_streamatt.py       | 21 ++++--
 uts/speech_processors/test_streamatt.py       | 73 ++++++++++++++++++-
 2 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/simulstream/server/speech_processors/base_streamatt.py b/simulstream/server/speech_processors/base_streamatt.py
index fa7ccd4..73a4683 100644
--- a/simulstream/server/speech_processors/base_streamatt.py
+++ b/simulstream/server/speech_processors/base_streamatt.py
@@ -26,6 +26,7 @@
 
 
 BOW_PREFIX = "\u2581"
+STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"]
 
 
 logger = logging.getLogger(__name__)
@@ -182,8 +183,7 @@ def _update_speech_history(self, discarded_text: int, cross_attn: torch.Tensor)
         # Check audio history not exceeding maximum allowed length
         self._cut_audio_exceeding_maxlen()
 
-    @staticmethod
-    def _strip_incomplete_words(tokens: List[str]) -> List[str]:
+    def _strip_incomplete_words(self, tokens: List[str]) -> List[str]:
         """
         Remove last incomplete word(s) from the new hypothesis.
 
@@ -193,6 +193,19 @@ def _strip_incomplete_words(tokens: List[str]) -> List[str]:
         Returns:
             List[str]: A list of generated tokens from which partial words are removed.
         """
+        # Some tokenizers emit a trailing empty token after punctuation/EOS; drop it first so
+        # complete outputs like [" output", ".", ""] are not mistaken for incomplete words
+        while tokens and tokens[-1] == "":
+            tokens = tokens[:-1]
+
+        if not tokens:
+            return []
+
+        last_token = tokens[-1].strip()
+        # If the hypothesis already ends with strong punctuation, keep it as a complete segment
+        if last_token and last_token[-1] in STRONG_PUNCTUATION:
+            return tokens
+
         tokens_to_write = []
         # iterate from the end and count how many trailing tokens to drop
         num_tokens_incomplete = 0
@@ -305,8 +318,6 @@ class PunctuationTextHistory:
     The current implementation supports only SentencePiece.
     """
 
-    STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"]
-
     def __init__(self, config: SimpleNamespace):
         self.config = config
 
@@ -317,7 +328,7 @@ def select_text_history(self, text_history):
         for token in reversed(text_history):
             prefix_token = token
             contains_punctuation = False
-            for punct in self.STRONG_PUNCTUATION:
+            for punct in STRONG_PUNCTUATION:
                 if punct in prefix_token:
                     contains_punctuation = True
                     break
diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py
index 180c408..159dc15 100644
--- a/uts/speech_processors/test_streamatt.py
+++ b/uts/speech_processors/test_streamatt.py
@@ -15,7 +15,7 @@
 import unittest
 from types import SimpleNamespace
 
-from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory
+from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt
 
 
 class TestPunctuationTextHistory(unittest.TestCase):
@@ -58,6 +58,77 @@ def test_no_strong_punctuation(self):
         zh_history = ['回', '到', '纽', '约', '后', '，', '我']
         selected_history = self.punctuation_text_history.select_text_history(zh_history)
         self.assertEqual(selected_history, ['回', '到', '纽', '约', '后', '，', '我'])
+        
+        
+class TestStripIncompleteWords(unittest.TestCase):
+    def setUp(self):
+        self.config = SimpleNamespace()
+        self._strip_incomplete_words = BaseStreamAtt._strip_incomplete_words
+    
+    def test_incomplete_word_is_stripped(self):
+        """Last word has no closing token — should be dropped."""
+        stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac"])
+        self.assertEqual(stripped, ["▁U", "ser"])
+    
+    def test_single_incomplete_word_returns_empty(self):
+        """Only one word and it's incomplete — nothing left to return."""
+        stripped = self._strip_incomplete_words(self, ["▁Inter", "ac"])
+        self.assertEqual(stripped, [])
+    
+    def test_multiple_incomplete_tokens_all_stripped(self):
+        """Several continuation tokens after the last BOW — all should be dropped."""
+        stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "ti"])
+        self.assertEqual(stripped, ["▁U", "ser"])
+
+    def test_ends_with_period_kept(self):
+        """Trailing period counts as strong punctuation — full token list returned."""
+        stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "tion", "."])
+        self.assertEqual(stripped, ["▁U", "ser", "▁Inter", "ac", "tion", "."])
+
+    def test_ends_with_multiple_periods(self):
+        """Trailing period counts as strong punctuation — full token list returned."""
+        stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."])
+        self.assertEqual(stripped, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."])
+    
+    def test_ends_with_non_strong_punctuation(self):
+        """Non strong punctuation marks should be treated as standard tokens."""
+        stripped = self._strip_incomplete_words(self, ["▁Hello", "-"])
+        self.assertEqual(stripped, [])
+    
+    def test_ends_with_question_mark(self):
+        """Question marks should be treated as strong punctuation."""
+        stripped = self._strip_incomplete_words(self, ["▁Is", "▁this", "▁work", "ing", "?"])
+        self.assertEqual(stripped, ["▁Is", "▁this", "▁work", "ing", "?"])
+
+    def test_trailing_empty_token_stripped_before_check(self):
+        """Empty trailing tokens should be dropped; remaining punctuation keeps the list intact."""
+        stripped = self._strip_incomplete_words(self, ["▁output", ".", ""])
+        self.assertEqual(stripped, ["▁output", "."])
+    
+    def test_multiple_trailing_empty_tokens(self):
+        """Multiple trailing empty tokens should be dropped."""
+        stripped = self._strip_incomplete_words(self, ["▁Hello", ".", "", ""])
+        self.assertEqual(stripped, ["▁Hello", "."])
+    
+    def test_only_empty_tokens_returns_empty(self):
+        """Only empty tokens should be dropped."""
+        stripped = self._strip_incomplete_words(self, ["", "", ""])
+        self.assertEqual(stripped, [])
+
+    def test_empty_input(self):
+        """Empty input should return an empty list."""
+        stripped = self._strip_incomplete_words(self, [])
+        self.assertEqual(stripped, [])
+    
+    def test_single_bow_token_incomplete(self):
+        """A lone BOW token with no following token is itself incomplete."""
+        stripped = self._strip_incomplete_words(self, ["▁Hello"])
+        self.assertEqual(stripped, [])
+    
+    def test_no_bow_prefix_at_all(self):
+        """No BOW token anywhere — loop never breaks, returns empty list."""
+        stripped = self._strip_incomplete_words(self, ["ac", "tion"])
+        self.assertEqual(stripped, [])
 
 
 if __name__ == "__main__":

From 534684f105dbd302509d2ab840c6306c04833324 Mon Sep 17 00:00:00 2001
From: spapi <spapi@fbk.eu>
Date: Fri, 27 Mar 2026 11:42:20 +0100
Subject: [PATCH 2/4] Fix linting

---
 uts/speech_processors/test_streamatt.py | 27 +++++++++++++------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py
index 159dc15..d660048 100644
--- a/uts/speech_processors/test_streamatt.py
+++ b/uts/speech_processors/test_streamatt.py
@@ -15,7 +15,7 @@
 import unittest
 from types import SimpleNamespace
 
-from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt
+from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt # noqa: E402
 
 
 class TestPunctuationTextHistory(unittest.TestCase):
@@ -58,23 +58,23 @@ def test_no_strong_punctuation(self):
         zh_history = ['回', '到', '纽', '约', '后', '，', '我']
         selected_history = self.punctuation_text_history.select_text_history(zh_history)
         self.assertEqual(selected_history, ['回', '到', '纽', '约', '后', '，', '我'])
-        
-        
+
+
 class TestStripIncompleteWords(unittest.TestCase):
     def setUp(self):
         self.config = SimpleNamespace()
         self._strip_incomplete_words = BaseStreamAtt._strip_incomplete_words
-    
+
     def test_incomplete_word_is_stripped(self):
         """Last word has no closing token — should be dropped."""
         stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac"])
         self.assertEqual(stripped, ["▁U", "ser"])
-    
+
     def test_single_incomplete_word_returns_empty(self):
         """Only one word and it's incomplete — nothing left to return."""
         stripped = self._strip_incomplete_words(self, ["▁Inter", "ac"])
         self.assertEqual(stripped, [])
-    
+
     def test_multiple_incomplete_tokens_all_stripped(self):
         """Several continuation tokens after the last BOW — all should be dropped."""
         stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "ti"])
@@ -87,14 +87,15 @@ def test_ends_with_period_kept(self):
 
     def test_ends_with_multiple_periods(self):
         """Trailing period counts as strong punctuation — full token list returned."""
-        stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."])
+        stripped = self._strip_incomplete_words(
+            self, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."])
         self.assertEqual(stripped, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."])
-    
+
     def test_ends_with_non_strong_punctuation(self):
         """Non strong punctuation marks should be treated as standard tokens."""
         stripped = self._strip_incomplete_words(self, ["▁Hello", "-"])
         self.assertEqual(stripped, [])
-    
+
     def test_ends_with_question_mark(self):
         """Question marks should be treated as strong punctuation."""
         stripped = self._strip_incomplete_words(self, ["▁Is", "▁this", "▁work", "ing", "?"])
@@ -104,12 +105,12 @@ def test_trailing_empty_token_stripped_before_check(self):
         """Empty trailing tokens should be dropped; remaining punctuation keeps the list intact."""
         stripped = self._strip_incomplete_words(self, ["▁output", ".", ""])
         self.assertEqual(stripped, ["▁output", "."])
-    
+
     def test_multiple_trailing_empty_tokens(self):
         """Multiple trailing empty tokens should be dropped."""
         stripped = self._strip_incomplete_words(self, ["▁Hello", ".", "", ""])
         self.assertEqual(stripped, ["▁Hello", "."])
-    
+
     def test_only_empty_tokens_returns_empty(self):
         """Only empty tokens should be dropped."""
         stripped = self._strip_incomplete_words(self, ["", "", ""])
@@ -119,12 +120,12 @@ def test_empty_input(self):
         """Empty input should return an empty list."""
         stripped = self._strip_incomplete_words(self, [])
         self.assertEqual(stripped, [])
-    
+
     def test_single_bow_token_incomplete(self):
         """A lone BOW token with no following token is itself incomplete."""
         stripped = self._strip_incomplete_words(self, ["▁Hello"])
         self.assertEqual(stripped, [])
-    
+
     def test_no_bow_prefix_at_all(self):
         """No BOW token anywhere — loop never breaks, returns empty list."""
         stripped = self._strip_incomplete_words(self, ["ac", "tion"])

From b9612ac4fc77e7fd955ec5cdaea6f14cefb77852 Mon Sep 17 00:00:00 2001
From: spapi <spapi@fbk.eu>
Date: Fri, 27 Mar 2026 12:02:04 +0100
Subject: [PATCH 3/4] Fix linting

---
 uts/speech_processors/test_streamatt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py
index d660048..35f160a 100644
--- a/uts/speech_processors/test_streamatt.py
+++ b/uts/speech_processors/test_streamatt.py
@@ -15,7 +15,7 @@
 import unittest
 from types import SimpleNamespace
 
-from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt # noqa: E402
+from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt  # noqa: E402
 
 
 class TestPunctuationTextHistory(unittest.TestCase):

From 2677158468024a19e28b7075ebe115ad50c8d175 Mon Sep 17 00:00:00 2001
From: spapi <spapi@fbk.eu>
Date: Fri, 27 Mar 2026 12:11:58 +0100
Subject: [PATCH 4/4] Fix linting

---
 uts/speech_processors/test_streamatt.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py
index 35f160a..efc0bcb 100644
--- a/uts/speech_processors/test_streamatt.py
+++ b/uts/speech_processors/test_streamatt.py
@@ -15,7 +15,8 @@
 import unittest
 from types import SimpleNamespace
 
-from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt  # noqa: E402
+from simulstream.server.speech_processors.base_streamatt import (
+    PunctuationTextHistory, BaseStreamAtt)
 
 
 class TestPunctuationTextHistory(unittest.TestCase):