From add2c28184a56e87745c75481a062eaf538817df Mon Sep 17 00:00:00 2001
From: Mircea Lungu <mircea.lungu@gmail.com>
Date: Tue, 24 Mar 2026 19:33:42 +0100
Subject: [PATCH 1/3] Preserve bold/italic formatting from teacher HTML in
 tokenized output

When teachers use bold/italic in the rich text editor, the inline
formatting was lost during tokenization because get_text() strips
all HTML tags. This re-parses the article's htmlContent after
tokenization to tag individual tokens with is_bold/is_italic flags.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 zeeguu/core/model/article.py      | 83 +++++++++++++++++++++++++++++++
 zeeguu/core/tokenization/token.py |  9 ++++
 2 files changed, 92 insertions(+)

diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py
index ab8cf9582..eacf4a30a 100644
--- a/zeeguu/core/model/article.py
+++ b/zeeguu/core/model/article.py
@@ -472,6 +472,76 @@ def get_tokenized_content(self):
 
         return result
 
+    @staticmethod
+    def _extract_inline_formatting(element):
+        """
+        Extract character ranges with bold/italic formatting from an HTML element.
+        Returns a list of (char_start, char_end, is_bold, is_italic) tuples
+        where positions refer to the plain text (get_text()) of the element.
+        """
+        from bs4 import NavigableString
+
+        ranges = []
+        pos = [0]
+
+        def walk(node, bold=False, italic=False):
+            if isinstance(node, NavigableString):
+                text_len = len(str(node))
+                if text_len > 0 and (bold or italic):
+                    ranges.append((pos[0], pos[0] + text_len, bold, italic))
+                pos[0] += text_len
+            elif hasattr(node, "children"):
+                new_bold = bold or node.name in ("strong", "b")
+                new_italic = italic or node.name in ("em", "i")
+                for child in node.children:
+                    walk(child, new_bold, new_italic)
+
+        walk(element)
+        return ranges
+
+    @staticmethod
+    def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges):
+        """Tag tokens with is_bold/is_italic based on their position in the plain text."""
+        if not formatting_ranges:
+            return
+
+        pos = 0
+        for para in token_paragraphs:
+            for sentence in para:
+                for token in sentence:
+                    idx = plain_text.find(token.text, pos)
+                    if idx >= 0:
+                        token_end = idx + len(token.text)
+                        for start, end, is_bold, is_italic in formatting_ranges:
+                            if idx < end and token_end > start:
+                                if is_bold:
+                                    token.is_bold = True
+                                if is_italic:
+                                    token.is_italic = True
+                        pos = token_end
+
+    def _get_html_block_elements(self):
+        """
+        Re-parse htmlContent and return block elements in the same order
+        as create_article_fragments() produces them.
+        """
+        from bs4 import BeautifulSoup
+
+        html_content = getattr(self, "htmlContent", None)
+        if not html_content:
+            return []
+
+        soup = BeautifulSoup(html_content, "html.parser")
+        block_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]
+        elements = []
+
+        for element in soup.find_all(block_tags):
+            if element.name == "blockquote":
+                continue
+            elements.append(element)
+
+        return elements
+
     def _tokenize_fragments(self, tokenizer):
         """
         Tokenize article fragments with batch MWE processing for efficiency.
@@ -539,6 +609,19 @@ def _tokenize_fragments(self, tokenizer):
                         frag_tokens, self.language.code, mode="stanza"
                     )
 
+        # Tag tokens with inline formatting (bold/italic) from HTML source
+        html_elements = self._get_html_block_elements()
+        if html_elements:
+            for frag_idx, frag_tokens in enumerate(fragment_tokens_list):
+                if frag_idx < len(html_elements):
+                    element = html_elements[frag_idx]
+                    formatting_ranges = self._extract_inline_formatting(element)
+                    if formatting_ranges:
+                        plain_text = element.get_text()
+                        self._tag_tokens_with_formatting(
+                            frag_tokens, plain_text, formatting_ranges
+                        )
+
         # Build result
         tokenized_fragments = []
         for frag_idx, fragment in enumerate(fragments_list):
diff --git a/zeeguu/core/tokenization/token.py b/zeeguu/core/tokenization/token.py
index fe5be5ced..6dcb4cd40 100644
--- a/zeeguu/core/tokenization/token.py
+++ b/zeeguu/core/tokenization/token.py
@@ -77,6 +77,10 @@ def __init__(
         self.dep = dep
         self.head = head
         self.lemma = lemma
+        # Inline formatting flags - set from HTML source after tokenization
+        self.is_bold = False
+        self.is_italic = False
+
         # MWE (Multi-Word Expression) fields - set by MWE detector after tokenization
         self.mwe_group_id = None
         self.mwe_role = None  # "head" | "dependent" | None
@@ -106,6 +110,11 @@ def as_serializable_dictionary(self):
             "head": self.head,
             "lemma": self.lemma,
         }
+        # Only include inline formatting flags when set (to minimize payload)
+        if self.is_bold:
+            result["is_bold"] = True
+        if self.is_italic:
+            result["is_italic"] = True
         # Only include MWE fields if token is part of an MWE (to minimize payload)
         if self.mwe_group_id:
             result["mwe_group_id"] = self.mwe_group_id

From e1f0b0b05e67389de8f7564a487167d47a9985f7 Mon Sep 17 00:00:00 2001
From: Mircea Lungu <mircea.lungu@gmail.com>
Date: Tue, 24 Mar 2026 19:39:07 +0100
Subject: [PATCH 2/3] Simplify: extract shared BLOCK_TAGS, fix empty-element
 alignment, guard find()

- Extract duplicated block tag list into BLOCK_TAGS constant
- Filter empty elements in _get_html_block_elements to stay aligned with fragments
- Use early-continue when str.find() returns -1 to flatten the nesting
  (behavior unchanged: the cursor is left where it was in both versions)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 zeeguu/core/model/article.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py
index eacf4a30a..9ededf4a7 100644
--- a/zeeguu/core/model/article.py
+++ b/zeeguu/core/model/article.py
@@ -85,6 +85,9 @@ def process_result_value(self, value, dialect):
 """
 
 
+BLOCK_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]
+
+
 class Article(db.Model):
     __table_args__ = {"mysql_collate": "utf8_bin"}
 
@@ -342,12 +345,7 @@ def create_article_fragments(self, session):
 
         # Extract text content from HTML elements and create fragments
         order = 0
-        # Include block-level HTML elements: headings, paragraphs, list items, blockquotes
-        # Note: We skip ul/ol containers to avoid duplication, only process individual li items
-        # Note: We skip inline elements like strong, em here as they should be preserved within their parent blocks
-        block_elements = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]
-
-        for element in soup.find_all(block_elements):
+        for element in soup.find_all(BLOCK_TAGS):
             # Skip blockquote containers - we'll process their paragraph children instead
             if element.name == "blockquote":
                 continue
@@ -510,20 +508,23 @@ def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges)
             for sentence in para:
                 for token in sentence:
                     idx = plain_text.find(token.text, pos)
-                    if idx >= 0:
-                        token_end = idx + len(token.text)
-                        for start, end, is_bold, is_italic in formatting_ranges:
-                            if idx < end and token_end > start:
-                                if is_bold:
-                                    token.is_bold = True
-                                if is_italic:
-                                    token.is_italic = True
-                        pos = token_end
+                    if idx < 0:
+                        # Tokenizer text diverged from HTML text; skip
+                        continue
+                    token_end = idx + len(token.text)
+                    for start, end, is_bold, is_italic in formatting_ranges:
+                        if idx < end and token_end > start:
+                            if is_bold:
+                                token.is_bold = True
+                            if is_italic:
+                                token.is_italic = True
+                    pos = token_end
 
     def _get_html_block_elements(self):
         """
         Re-parse htmlContent and return block elements in the same order
         as create_article_fragments() produces them.
+        Skips blockquote containers and empty elements to stay aligned.
         """
         from bs4 import BeautifulSoup
 
@@ -532,12 +533,13 @@ def _get_html_block_elements(self):
             return []
 
         soup = BeautifulSoup(html_content, "html.parser")
-        block_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]
         elements = []
 
-        for element in soup.find_all(block_tags):
+        for element in soup.find_all(BLOCK_TAGS):
             if element.name == "blockquote":
                 continue
+            if not element.get_text().strip():
+                continue
             elements.append(element)
 
         return elements

From be1808915990d41ceb682cf7f711b50562f3499b Mon Sep 17 00:00:00 2001
From: Mircea Lungu <mircea.lungu@gmail.com>
Date: Wed, 25 Mar 2026 16:12:13 +0100
Subject: [PATCH 3/3] Fix: handle already-serialized dict tokens in formatting
 tagger

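The tokenizers return dicts by default (as_serializable_dictionary=True),
not Token objects, so the attribute access used by the tagger
(token.text, token.is_bold, token.is_italic) does not work on them.
_tag_tokens_with_formatting now handles both formats.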

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 zeeguu/core/model/article.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py
index 9ededf4a7..584d2bd87 100644
--- a/zeeguu/core/model/article.py
+++ b/zeeguu/core/model/article.py
@@ -507,17 +507,23 @@ def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges)
         for para in token_paragraphs:
             for sentence in para:
                 for token in sentence:
-                    idx = plain_text.find(token.text, pos)
+                    token_text = token["text"] if isinstance(token, dict) else token.text
+                    idx = plain_text.find(token_text, pos)
                     if idx < 0:
-                        # Tokenizer text diverged from HTML text; skip
                         continue
-                    token_end = idx + len(token.text)
+                    token_end = idx + len(token_text)
                     for start, end, is_bold, is_italic in formatting_ranges:
                         if idx < end and token_end > start:
                             if is_bold:
-                                token.is_bold = True
+                                if isinstance(token, dict):
+                                    token["is_bold"] = True
+                                else:
+                                    token.is_bold = True
                             if is_italic:
-                                token.is_italic = True
+                                if isinstance(token, dict):
+                                    token["is_italic"] = True
+                                else:
+                                    token.is_italic = True
                     pos = token_end
 
     def _get_html_block_elements(self):