diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py index ab8cf958..584d2bd8 100644 --- a/zeeguu/core/model/article.py +++ b/zeeguu/core/model/article.py @@ -85,6 +85,9 @@ def process_result_value(self, value, dialect): """ +BLOCK_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"] + + class Article(db.Model): __table_args__ = {"mysql_collate": "utf8_bin"} @@ -342,12 +345,7 @@ def create_article_fragments(self, session): # Extract text content from HTML elements and create fragments order = 0 - # Include block-level HTML elements: headings, paragraphs, list items, blockquotes - # Note: We skip ul/ol containers to avoid duplication, only process individual li items - # Note: We skip inline elements like strong, em here as they should be preserved within their parent blocks - block_elements = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"] - - for element in soup.find_all(block_elements): + for element in soup.find_all(BLOCK_TAGS): # Skip blockquote containers - we'll process their paragraph children instead if element.name == "blockquote": continue @@ -472,6 +470,86 @@ def get_tokenized_content(self): return result + @staticmethod + def _extract_inline_formatting(element): + """ + Extract character ranges with bold/italic formatting from an HTML element. + Returns a list of (char_start, char_end, is_bold, is_italic) tuples + where positions refer to the plain text (get_text()) of the element. 
+ """ + from bs4 import NavigableString + + ranges = [] + pos = [0] + + def walk(node, bold=False, italic=False): + if isinstance(node, NavigableString): + text_len = len(str(node)) + if text_len > 0 and (bold or italic): + ranges.append((pos[0], pos[0] + text_len, bold, italic)) + pos[0] += text_len + elif hasattr(node, "children"): + new_bold = bold or node.name in ("strong", "b") + new_italic = italic or node.name in ("em", "i") + for child in node.children: + walk(child, new_bold, new_italic) + + walk(element) + return ranges + + @staticmethod + def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges): + """Tag tokens with is_bold/is_italic based on their position in the plain text.""" + if not formatting_ranges: + return + + pos = 0 + for para in token_paragraphs: + for sentence in para: + for token in sentence: + token_text = token["text"] if isinstance(token, dict) else token.text + idx = plain_text.find(token_text, pos) + if idx < 0: + continue + token_end = idx + len(token_text) + for start, end, is_bold, is_italic in formatting_ranges: + if idx < end and token_end > start: + if is_bold: + if isinstance(token, dict): + token["is_bold"] = True + else: + token.is_bold = True + if is_italic: + if isinstance(token, dict): + token["is_italic"] = True + else: + token.is_italic = True + pos = token_end + + def _get_html_block_elements(self): + """ + Re-parse htmlContent and return block elements in the same order + as create_article_fragments() produces them. + Skips blockquote containers and empty elements to stay aligned. 
+ """ + from bs4 import BeautifulSoup + + html_content = getattr(self, "htmlContent", None) + if not html_content: + return [] + + soup = BeautifulSoup(html_content, "html.parser") + elements = [] + + for element in soup.find_all(BLOCK_TAGS): + if element.name == "blockquote": + continue + if not element.get_text().strip(): + continue + elements.append(element) + + return elements + def _tokenize_fragments(self, tokenizer): """ Tokenize article fragments with batch MWE processing for efficiency. @@ -539,6 +617,19 @@ def _tokenize_fragments(self, tokenizer): frag_tokens, self.language.code, mode="stanza" ) + # Tag tokens with inline formatting (bold/italic) from HTML source + html_elements = self._get_html_block_elements() + if html_elements: + for frag_idx, frag_tokens in enumerate(fragment_tokens_list): + if frag_idx < len(html_elements): + element = html_elements[frag_idx] + formatting_ranges = self._extract_inline_formatting(element) + if formatting_ranges: + plain_text = element.get_text() + self._tag_tokens_with_formatting( + frag_tokens, plain_text, formatting_ranges + ) + # Build result tokenized_fragments = [] for frag_idx, fragment in enumerate(fragments_list): diff --git a/zeeguu/core/tokenization/token.py b/zeeguu/core/tokenization/token.py index fe5be5ce..6dcb4cd4 100644 --- a/zeeguu/core/tokenization/token.py +++ b/zeeguu/core/tokenization/token.py @@ -77,6 +77,10 @@ def __init__( self.dep = dep self.head = head self.lemma = lemma + # Inline formatting flags - set from HTML source after tokenization + self.is_bold = False + self.is_italic = False + # MWE (Multi-Word Expression) fields - set by MWE detector after tokenization self.mwe_group_id = None self.mwe_role = None # "head" | "dependent" | None @@ -106,6 +110,11 @@ def as_serializable_dictionary(self): "head": self.head, "lemma": self.lemma, } + # Only include inline formatting flags when set (to minimize payload) + if self.is_bold: + result["is_bold"] = True + if self.is_italic: + 
+            result["is_italic"] = True
         # Only include MWE fields if token is part of an MWE (to minimize payload)
         if self.mwe_group_id:
             result["mwe_group_id"] = self.mwe_group_id