zeeguu · mircealungu · Mar 24, 2026 · Mar 24, 2026
diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py
@@ -85,6 +85,9 @@ def process_result_value(self, value, dialect):
 """
 
 
+BLOCK_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]  # no ul/ol, only li
+
+
 class Article(db.Model):
     __table_args__ = {"mysql_collate": "utf8_bin"}
 
@@ -342,12 +345,7 @@ def create_article_fragments(self, session):
 
         # Extract text content from HTML elements and create fragments
         order = 0
-        # Include block-level HTML elements: headings, paragraphs, list items, blockquotes
-        # Note: We skip ul/ol containers to avoid duplication, only process individual li items
-        # Note: We skip inline elements like strong, em here as they should be preserved within their parent blocks
-        block_elements = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]
-
-        for element in soup.find_all(block_elements):
+        for element in soup.find_all(BLOCK_TAGS):
             # Skip blockquote containers - we'll process their paragraph children instead
             if element.name == "blockquote":
                 continue
@@ -472,6 +470,80 @@ def get_tokenized_content(self):
 
         return result
 
+    @staticmethod
+    def _extract_inline_formatting(element):
+        """
+        Extract character ranges with bold/italic formatting from an HTML element.
+        Returns a list of (char_start, char_end, is_bold, is_italic) tuples whose
+        offsets index element.get_text(); strings it omits (comments) are skipped.
+        """
+        from bs4 import CData, NavigableString
+
+        ranges, pos = [], [0]  # pos is a one-item list so walk() can advance it
+
+        def walk(node, bold=False, italic=False):
+            if isinstance(node, NavigableString):
+                counted = type(node) in (NavigableString, CData)  # as get_text() does
+                text_len = len(str(node)) if counted else 0
+                if text_len > 0 and (bold or italic):
+                    ranges.append((pos[0], pos[0] + text_len, bold, italic))
+                pos[0] += text_len
+            elif hasattr(node, "children"):
+                new_bold = bold or node.name in ("strong", "b")
+                new_italic = italic or node.name in ("em", "i")
+                for child in node.children:
+                    walk(child, new_bold, new_italic)
+
+        walk(element)
+        return ranges
+
+    @staticmethod
+    def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges):
+        """
+        Set is_bold / is_italic on each token that overlaps a formatted range.
+        Tokens are located in plain_text left to right with str.find; ranges are
+        (start, end, is_bold, is_italic) tuples. Tokens are mutated in place.
+        """
+        if not formatting_ranges:
+            return
+        cursor = 0  # search resumes just past the previously matched token
+        flat = (t for para in token_paragraphs for sent in para for t in sent)
+        for tok in flat:
+            begin = plain_text.find(tok.text, cursor)
+            if begin < 0:
+                continue  # tokenizer text diverged from HTML text; leave untagged
+            cursor = finish = begin + len(tok.text)
+            for lo, hi, bold, italic in formatting_ranges:
+                if begin < hi and lo < finish:  # token span intersects [lo, hi)
+                    if bold:
+                        tok.is_bold = True
+                    if italic:
+                        tok.is_italic = True
+
+    def _get_html_block_elements(self):
+        """
+        Parse self.htmlContent and collect its block-level tags.
+
+        Tags are selected with BLOCK_TAGS; blockquote wrappers and tags whose
+        text is blank are left out, which is meant to keep the list index-aligned
+        with the fragments built by create_article_fragments().
+        Returns an empty list when htmlContent is missing or empty.
+        """
+        from bs4 import BeautifulSoup
+
+        markup = getattr(self, "htmlContent", None)
+        if not markup:
+            return []
+
+        parsed = BeautifulSoup(markup, "html.parser")
+        # Keep only tags that carry text of their own; blockquote wrappers are
+        # excluded so that the block tags nested inside them are used instead.
+        return [
+            tag
+            for tag in parsed.find_all(BLOCK_TAGS)
+            if tag.name != "blockquote" and tag.get_text().strip()
+        ]
+
     def _tokenize_fragments(self, tokenizer):
         """
         Tokenize article fragments with batch MWE processing for efficiency.
@@ -539,6 +611,19 @@ def _tokenize_fragments(self, tokenizer):
                         frag_tokens, self.language.code, mode="stanza"
                     )
 
+        # Tag tokens with inline formatting (bold/italic) from HTML source
+        html_elements = self._get_html_block_elements()
+        if html_elements:
+            for frag_idx, frag_tokens in enumerate(fragment_tokens_list):
+                if frag_idx < len(html_elements):
+                    element = html_elements[frag_idx]
+                    formatting_ranges = self._extract_inline_formatting(element)
+                    if formatting_ranges:
+                        plain_text = element.get_text()
+                        self._tag_tokens_with_formatting(
+                            frag_tokens, plain_text, formatting_ranges
+                        )
+
         # Build result
         tokenized_fragments = []
         for frag_idx, fragment in enumerate(fragments_list):

diff --git a/zeeguu/core/tokenization/token.py b/zeeguu/core/tokenization/token.py
@@ -77,6 +77,10 @@ def __init__(
         self.dep = dep
         self.head = head
         self.lemma = lemma
+        # Inline formatting flags - set from HTML source after tokenization
+        self.is_bold = False
+        self.is_italic = False
+
         # MWE (Multi-Word Expression) fields - set by MWE detector after tokenization
         self.mwe_group_id = None
         self.mwe_role = None  # "head" | "dependent" | None
@@ -106,6 +110,11 @@ def as_serializable_dictionary(self):
             "head": self.head,
             "lemma": self.lemma,
         }
+        # Only include inline formatting flags when set (to minimize payload)
+        if self.is_bold:
+            result["is_bold"] = True
+        if self.is_italic:
+            result["is_italic"] = True
         # Only include MWE fields if token is part of an MWE (to minimize payload)
         if self.mwe_group_id:
             result["mwe_group_id"] = self.mwe_group_id