From add2c28184a56e87745c75481a062eaf538817df Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Tue, 24 Mar 2026 19:33:42 +0100 Subject: [PATCH 1/3] Preserve bold/italic formatting from teacher HTML in tokenized output When teachers use bold/italic in the rich text editor, the inline formatting was lost during tokenization because get_text() strips all HTML tags. This re-parses the article's htmlContent after tokenization to tag individual tokens with is_bold/is_italic flags. Co-Authored-By: Claude Opus 4.6 (1M context) --- zeeguu/core/model/article.py | 83 +++++++++++++++++++++++++++++++ zeeguu/core/tokenization/token.py | 9 ++++ 2 files changed, 92 insertions(+) diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index ab8cf9582..eacf4a30a 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -472,6 +472,76 @@ def get_tokenized_content(self): return result + @staticmethod + def _extract_inline_formatting(element): + """ + Extract character ranges with bold/italic formatting from an HTML element. + Returns a list of (char_start, char_end, is_bold, is_italic) tuples + where positions refer to the plain text (get_text()) of the element. + """ + from bs4 import NavigableString + + ranges = [] + pos = [0] + + def walk(node, bold=False, italic=False): + if isinstance(node, NavigableString): + text_len = len(str(node)) + if text_len > 0 and (bold or italic): + ranges.append((pos[0], pos[0] + text_len, bold, italic)) + pos[0] += text_len + elif hasattr(node, "children"): + new_bold = bold or node.name in ("strong", "b") + new_italic = italic or node.name in ("em", "i") + for child in node.children: + walk(child, new_bold, new_italic) + + walk(element) + return ranges + + @staticmethod + def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges): + """Tag tokens with is_bold/is_italic based on their position in the plain text.""" + if not formatting_ranges: + return + + pos = 0 + for para in token_paragraphs: + for sentence in para: + for token in sentence: + idx = plain_text.find(token.text, pos) + if idx >= 0: + token_end = idx + len(token.text) + for start, end, is_bold, is_italic in formatting_ranges: + if idx < end and token_end > start: + if is_bold: + token.is_bold = True + if is_italic: + token.is_italic = True + pos = token_end + + def _get_html_block_elements(self): + """ + Re-parse htmlContent and return block elements in the same order + as create_article_fragments() produces them. + """ + from bs4 import BeautifulSoup + + html_content = getattr(self, "htmlContent", None) + if not html_content: + return [] + + soup = BeautifulSoup(html_content, "html.parser") + block_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"] + elements = [] + + for element in soup.find_all(block_tags): + if element.name == "blockquote": + continue + elements.append(element) + + return elements + def _tokenize_fragments(self, tokenizer): """ Tokenize article fragments with batch MWE processing for efficiency. @@ -539,6 +609,19 @@ def _tokenize_fragments(self, tokenizer): frag_tokens, self.language.code, mode="stanza" ) + # Tag tokens with inline formatting (bold/italic) from HTML source + html_elements = self._get_html_block_elements() + if html_elements: + for frag_idx, frag_tokens in enumerate(fragment_tokens_list): + if frag_idx < len(html_elements): + element = html_elements[frag_idx] + formatting_ranges = self._extract_inline_formatting(element) + if formatting_ranges: + plain_text = element.get_text() + self._tag_tokens_with_formatting( + frag_tokens, plain_text, formatting_ranges + ) + # Build result tokenized_fragments = [] for frag_idx, fragment in enumerate(fragments_list): diff --git a/zeeguu/core/tokenization/token.py b/zeeguu/core/tokenization/token.py index fe5be5ced..6dcb4cd40 100644 --- a/zeeguu/core/tokenization/token.py +++ b/zeeguu/core/tokenization/token.py @@ -77,6 +77,10 @@ def __init__( self.dep = dep self.head = head self.lemma = lemma + # Inline formatting flags - set from HTML source after tokenization + self.is_bold = False + self.is_italic = False + # MWE (Multi-Word Expression) fields - set by MWE detector after tokenization self.mwe_group_id = None self.mwe_role = None # "head" | "dependent" | None @@ -106,6 +110,11 @@ def as_serializable_dictionary(self): "head": self.head, "lemma": self.lemma, } + # Only include inline formatting flags when set (to minimize payload) + if self.is_bold: + result["is_bold"] = True + if self.is_italic: + result["is_italic"] = True # Only include MWE fields if token is part of an MWE (to minimize payload) if self.mwe_group_id: result["mwe_group_id"] = self.mwe_group_id From e1f0b0b05e67389de8f7564a487167d47a9985f7 Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Tue, 24 Mar 2026 19:39:07 +0100 Subject: [PATCH 2/3] Simplify: extract shared BLOCK_TAGS, fix empty-element alignment, guard find() - Extract duplicated block tag list into BLOCK_TAGS constant - Filter empty elements in _get_html_block_elements to stay aligned with fragments - Use early-continue when str.find() returns -1 to avoid stalling the cursor Co-Authored-By: Claude Opus 4.6 (1M context) --- zeeguu/core/model/article.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index eacf4a30a..9ededf4a7 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -85,6 +85,9 @@ def process_result_value(self, value, dialect): """ +BLOCK_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"] + + class Article(db.Model): __table_args__ = {"mysql_collate": "utf8_bin"} @@ -342,12 +345,7 @@ def create_article_fragments(self, session): # Extract text content from HTML elements and create fragments order = 0 - # Include block-level HTML elements: headings, paragraphs, list items, blockquotes - # Note: We skip ul/ol containers to avoid duplication, only process individual li items - # Note: We skip inline elements like strong, em here as they should be preserved within their parent blocks - block_elements = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"] - - for element in soup.find_all(block_elements): + for element in soup.find_all(BLOCK_TAGS): # Skip blockquote containers - we'll process their paragraph children instead if element.name == "blockquote": continue @@ -510,20 +508,23 @@ def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges) for sentence in para: for token in sentence: idx = plain_text.find(token.text, pos) - if idx >= 0: - token_end = idx + len(token.text) - for start, end, is_bold, is_italic in formatting_ranges: - if idx < end and token_end > start: - if is_bold: - token.is_bold = True - if is_italic: - token.is_italic = True - pos = token_end + if idx < 0: + # Tokenizer text diverged from HTML text; skip + continue + token_end = idx + len(token.text) + for start, end, is_bold, is_italic in formatting_ranges: + if idx < end and token_end > start: + if is_bold: + token.is_bold = True + if is_italic: + token.is_italic = True + pos = token_end def _get_html_block_elements(self): """ Re-parse htmlContent and return block elements in the same order as create_article_fragments() produces them. + Skips blockquote containers and empty elements to stay aligned. """ from bs4 import BeautifulSoup @@ -532,12 +533,13 @@ def _get_html_block_elements(self): return [] soup = BeautifulSoup(html_content, "html.parser") - block_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"] elements = [] - for element in soup.find_all(block_tags): + for element in soup.find_all(BLOCK_TAGS): if element.name == "blockquote": continue + if not element.get_text().strip(): + continue elements.append(element) return elements From be1808915990d41ceb682cf7f711b50562f3499b Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Wed, 25 Mar 2026 16:12:13 +0100 Subject: [PATCH 3/3] Fix: handle already-serialized dict tokens in formatting tagger The tokenizers return dicts by default (as_serializable_dictionary=True), not Token objects. _tag_tokens_with_formatting now handles both formats. Co-Authored-By: Claude Opus 4.6 (1M context) --- zeeguu/core/model/article.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index 9ededf4a7..584d2bd87 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -507,17 +507,23 @@ def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges) for para in token_paragraphs: for sentence in para: for token in sentence: - idx = plain_text.find(token.text, pos) + token_text = token["text"] if isinstance(token, dict) else token.text + idx = plain_text.find(token_text, pos) if idx < 0: - # Tokenizer text diverged from HTML text; skip continue - token_end = idx + len(token.text) + token_end = idx + len(token_text) for start, end, is_bold, is_italic in formatting_ranges: if idx < end and token_end > start: if is_bold: - token.is_bold = True + if isinstance(token, dict): + token["is_bold"] = True + else: + token.is_bold = True if is_italic: - token.is_italic = True + if isinstance(token, dict): + token["is_italic"] = True + else: + token.is_italic = True pos = token_end def _get_html_block_elements(self):