From 4b5644e0e37987d2e37db1400d6c21ee10593fcb Mon Sep 17 00:00:00 2001 From: spapi Date: Fri, 27 Mar 2026 11:36:03 +0100 Subject: [PATCH 1/4] Fix bug in strip incomplete words --- .../speech_processors/base_streamatt.py | 21 ++++-- uts/speech_processors/test_streamatt.py | 73 ++++++++++++++++++- 2 files changed, 88 insertions(+), 6 deletions(-) diff --git a/simulstream/server/speech_processors/base_streamatt.py b/simulstream/server/speech_processors/base_streamatt.py index fa7ccd4..73a4683 100644 --- a/simulstream/server/speech_processors/base_streamatt.py +++ b/simulstream/server/speech_processors/base_streamatt.py @@ -26,6 +26,7 @@ BOW_PREFIX = "\u2581" +STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"] logger = logging.getLogger(__name__) @@ -182,8 +183,7 @@ def _update_speech_history(self, discarded_text: int, cross_attn: torch.Tensor) # Check audio history not exceeding maximum allowed length self._cut_audio_exceeding_maxlen() - @staticmethod - def _strip_incomplete_words(tokens: List[str]) -> List[str]: + def _strip_incomplete_words(self, tokens: List[str]) -> List[str]: """ Remove last incomplete word(s) from the new hypothesis. @@ -193,6 +193,19 @@ def _strip_incomplete_words(tokens: List[str]) -> List[str]: Returns: List[str]: A list of generated tokens from which partial words are removed. """ + # Some tokenizers emit a trailing empty token after punctuation/EOS; drop it first so + # complete outputs like [" output", ".", ""] are not mistaken for incomplete words + while tokens and tokens[-1] == "": + tokens = tokens[:-1] + + if not tokens: + return [] + + last_token = tokens[-1].strip() + # If the hypothesis already ends with punctuation, keep it as a complete segment + if last_token and last_token[-1] in STRONG_PUNCTUATION: + return tokens + tokens_to_write = [] # iterate from the end and count how many trailing tokens to drop num_tokens_incomplete = 0 @@ -305,8 +318,6 @@ class PunctuationTextHistory: The current implementation supports only SentencePiece. """ - STRONG_PUNCTUATION = [".", "!", "?", ":", ";", "。"] - def __init__(self, config: SimpleNamespace): self.config = config @@ -317,7 +328,7 @@ def select_text_history(self, text_history): for token in reversed(text_history): prefix_token = token contains_punctuation = False - for punct in self.STRONG_PUNCTUATION: + for punct in STRONG_PUNCTUATION: if punct in prefix_token: contains_punctuation = True break diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py index 180c408..159dc15 100644 --- a/uts/speech_processors/test_streamatt.py +++ b/uts/speech_processors/test_streamatt.py @@ -15,7 +15,7 @@ import unittest from types import SimpleNamespace -from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory +from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt class TestPunctuationTextHistory(unittest.TestCase): @@ -58,6 +58,77 @@ def test_no_strong_punctuation(self): zh_history = ['回', '到', '纽', '约', '后', ',', '我'] selected_history = self.punctuation_text_history.select_text_history(zh_history) self.assertEqual(selected_history, ['回', '到', '纽', '约', '后', ',', '我']) + + +class TestStripIncompleteWords(unittest.TestCase): + def setUp(self): + self.config = SimpleNamespace() + self._strip_incomplete_words = BaseStreamAtt._strip_incomplete_words + + def test_incomplete_word_is_stripped(self): + """Last word has no closing token — should be dropped.""" + stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac"]) + self.assertEqual(stripped, ["▁U", "ser"]) + + def test_single_incomplete_word_returns_empty(self): + """Only one word and it's incomplete — nothing left to return.""" + stripped = self._strip_incomplete_words(self, ["▁Inter", "ac"]) + self.assertEqual(stripped, []) + + def test_multiple_incomplete_tokens_all_stripped(self): + """Several continuation tokens after the last BOW — all should be dropped.""" + stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "ti"]) + self.assertEqual(stripped, ["▁U", "ser"]) + + def test_ends_with_period_kept(self): + """Trailing period counts as strong punctuation — full token list returned.""" + stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "tion", "."]) + self.assertEqual(stripped, ["▁U", "ser", "▁Inter", "ac", "tion", "."]) + + def test_ends_with_multiple_periods(self): + """Trailing period counts as strong punctuation — full token list returned.""" + stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."]) + self.assertEqual(stripped, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."]) + + def test_ends_with_non_strong_punctuation(self): + """Non strong punctuation marks should be treated as standard tokens.""" + stripped = self._strip_incomplete_words(self, ["▁Hello", "-"]) + self.assertEqual(stripped, []) + + def test_ends_with_question_mark(self): + """Question marks should be treated as strong punctuation.""" + stripped = self._strip_incomplete_words(self, ["▁Is", "▁this", "▁work", "ing", "?"]) + self.assertEqual(stripped, ["▁Is", "▁this", "▁work", "ing", "?"]) + + def test_trailing_empty_token_stripped_before_check(self): + """Empty trailing tokens should be dropped; remaining punctuation keeps the list intact.""" + stripped = self._strip_incomplete_words(self, ["▁output", ".", ""]) + self.assertEqual(stripped, ["▁output", "."]) + + def test_multiple_trailing_empty_tokens(self): + """Multiple trailing empty tokens should be dropped.""" + stripped = self._strip_incomplete_words(self, ["▁Hello", ".", "", ""]) + self.assertEqual(stripped, ["▁Hello", "."]) + + def test_only_empty_tokens_returns_empty(self): + """Only empty tokens should be dropped.""" + stripped = self._strip_incomplete_words(self, ["", "", ""]) + self.assertEqual(stripped, []) + + def test_empty_input(self): + """Empty input should return an empty list.""" + stripped = self._strip_incomplete_words(self, []) + self.assertEqual(stripped, []) + + def test_single_bow_token_incomplete(self): + """A lone BOW token with no following token is itself incomplete.""" + stripped = self._strip_incomplete_words(self, ["▁Hello"]) + self.assertEqual(stripped, []) + + def test_no_bow_prefix_at_all(self): + """No BOW token anywhere — loop never breaks, returns empty list.""" + stripped = self._strip_incomplete_words(self, ["ac", "tion"]) + self.assertEqual(stripped, []) if __name__ == "__main__": From 534684f105dbd302509d2ab840c6306c04833324 Mon Sep 17 00:00:00 2001 From: spapi Date: Fri, 27 Mar 2026 11:42:20 +0100 Subject: [PATCH 2/4] Fix linting --- uts/speech_processors/test_streamatt.py | 27 +++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py index 159dc15..d660048 100644 --- a/uts/speech_processors/test_streamatt.py +++ b/uts/speech_processors/test_streamatt.py @@ -15,7 +15,7 @@ import unittest from types import SimpleNamespace -from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt +from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt # noqa: E402 class TestPunctuationTextHistory(unittest.TestCase): @@ -58,23 +58,23 @@ def test_no_strong_punctuation(self): zh_history = ['回', '到', '纽', '约', '后', ',', '我'] selected_history = self.punctuation_text_history.select_text_history(zh_history) self.assertEqual(selected_history, ['回', '到', '纽', '约', '后', ',', '我']) - - + + class TestStripIncompleteWords(unittest.TestCase): def setUp(self): self.config = SimpleNamespace() self._strip_incomplete_words = BaseStreamAtt._strip_incomplete_words - + def test_incomplete_word_is_stripped(self): """Last word has no closing token — should be dropped.""" stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac"]) self.assertEqual(stripped, ["▁U", "ser"]) - + def test_single_incomplete_word_returns_empty(self): """Only one word and it's incomplete — nothing left to return.""" stripped = self._strip_incomplete_words(self, ["▁Inter", "ac"]) self.assertEqual(stripped, []) - + def test_multiple_incomplete_tokens_all_stripped(self): """Several continuation tokens after the last BOW — all should be dropped.""" stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "ti"]) @@ -87,14 +87,15 @@ def test_ends_with_period_kept(self): def test_ends_with_multiple_periods(self): """Trailing period counts as strong punctuation — full token list returned.""" - stripped = self._strip_incomplete_words(self, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."]) + stripped = self._strip_incomplete_words( + self, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."]) self.assertEqual(stripped, ["▁U", "ser", "▁Inter", "ac", "tion", ".", ".", "."]) - + def test_ends_with_non_strong_punctuation(self): """Non strong punctuation marks should be treated as standard tokens.""" stripped = self._strip_incomplete_words(self, ["▁Hello", "-"]) self.assertEqual(stripped, []) - + def test_ends_with_question_mark(self): """Question marks should be treated as strong punctuation.""" stripped = self._strip_incomplete_words(self, ["▁Is", "▁this", "▁work", "ing", "?"]) @@ -104,12 +105,12 @@ def test_trailing_empty_token_stripped_before_check(self): """Empty trailing tokens should be dropped; remaining punctuation keeps the list intact.""" stripped = self._strip_incomplete_words(self, ["▁output", ".", ""]) self.assertEqual(stripped, ["▁output", "."]) - + def test_multiple_trailing_empty_tokens(self): """Multiple trailing empty tokens should be dropped.""" stripped = self._strip_incomplete_words(self, ["▁Hello", ".", "", ""]) self.assertEqual(stripped, ["▁Hello", "."]) - + def test_only_empty_tokens_returns_empty(self): """Only empty tokens should be dropped.""" stripped = self._strip_incomplete_words(self, ["", "", ""]) @@ -119,12 +120,12 @@ def test_empty_input(self): """Empty input should return an empty list.""" stripped = self._strip_incomplete_words(self, []) self.assertEqual(stripped, []) - + def test_single_bow_token_incomplete(self): """A lone BOW token with no following token is itself incomplete.""" stripped = self._strip_incomplete_words(self, ["▁Hello"]) self.assertEqual(stripped, []) - + def test_no_bow_prefix_at_all(self): """No BOW token anywhere — loop never breaks, returns empty list.""" stripped = self._strip_incomplete_words(self, ["ac", "tion"]) From b9612ac4fc77e7fd955ec5cdaea6f14cefb77852 Mon Sep 17 00:00:00 2001 From: spapi Date: Fri, 27 Mar 2026 12:02:04 +0100 Subject: [PATCH 3/4] Fix linting --- uts/speech_processors/test_streamatt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py index d660048..35f160a 100644 --- a/uts/speech_processors/test_streamatt.py +++ b/uts/speech_processors/test_streamatt.py @@ -15,7 +15,7 @@ import unittest from types import SimpleNamespace -from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt # noqa: E402 +from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt # noqa: E402 class TestPunctuationTextHistory(unittest.TestCase): From 2677158468024a19e28b7075ebe115ad50c8d175 Mon Sep 17 00:00:00 2001 From: spapi Date: Fri, 27 Mar 2026 12:11:58 +0100 Subject: [PATCH 4/4] Fix linting --- uts/speech_processors/test_streamatt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/uts/speech_processors/test_streamatt.py b/uts/speech_processors/test_streamatt.py index 35f160a..efc0bcb 100644 --- a/uts/speech_processors/test_streamatt.py +++ b/uts/speech_processors/test_streamatt.py @@ -15,7 +15,8 @@ import unittest from types import SimpleNamespace -from simulstream.server.speech_processors.base_streamatt import PunctuationTextHistory, BaseStreamAtt # noqa: E402 +from simulstream.server.speech_processors.base_streamatt import ( + PunctuationTextHistory, BaseStreamAtt) class TestPunctuationTextHistory(unittest.TestCase):