Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 91 additions & 6 deletions zeeguu/core/model/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def process_result_value(self, value, dialect):
"""


# Block-level HTML tags searched when article HTML is split into fragments:
# paragraphs, headings, list items and blockquotes.
# ul/ol containers are deliberately absent so that only the individual li
# items are matched (avoids duplicating list text).
# "blockquote" is matched, but the loops that use this list skip the
# blockquote container itself and handle its paragraph children instead.
BLOCK_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]


class Article(db.Model):
__table_args__ = {"mysql_collate": "utf8_bin"}

Expand Down Expand Up @@ -342,12 +345,7 @@ def create_article_fragments(self, session):

# Extract text content from HTML elements and create fragments
order = 0
# Include block-level HTML elements: headings, paragraphs, list items, blockquotes
# Note: We skip ul/ol containers to avoid duplication, only process individual li items
# Note: We skip inline elements like strong, em here as they should be preserved within their parent blocks
block_elements = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"]

for element in soup.find_all(block_elements):
for element in soup.find_all(BLOCK_TAGS):
# Skip blockquote containers - we'll process their paragraph children instead
if element.name == "blockquote":
continue
Expand Down Expand Up @@ -472,6 +470,80 @@ def get_tokenized_content(self):

return result

@staticmethod
def _extract_inline_formatting(element):
"""
Extract character ranges with bold/italic formatting from an HTML element.
Returns a list of (char_start, char_end, is_bold, is_italic) tuples
where positions refer to the plain text (get_text()) of the element.
"""
from bs4 import NavigableString

ranges = []
pos = [0]

def walk(node, bold=False, italic=False):
if isinstance(node, NavigableString):
text_len = len(str(node))
if text_len > 0 and (bold or italic):
ranges.append((pos[0], pos[0] + text_len, bold, italic))
pos[0] += text_len
elif hasattr(node, "children"):
new_bold = bold or node.name in ("strong", "b")
new_italic = italic or node.name in ("em", "i")
for child in node.children:
walk(child, new_bold, new_italic)

walk(element)
return ranges

@staticmethod
def _tag_tokens_with_formatting(token_paragraphs, plain_text, formatting_ranges):
"""Tag tokens with is_bold/is_italic based on their position in the plain text."""
if not formatting_ranges:
return

pos = 0
for para in token_paragraphs:
for sentence in para:
for token in sentence:
idx = plain_text.find(token.text, pos)
if idx < 0:
# Tokenizer text diverged from HTML text; skip
continue
token_end = idx + len(token.text)
for start, end, is_bold, is_italic in formatting_ranges:
if idx < end and token_end > start:
if is_bold:
token.is_bold = True
if is_italic:
token.is_italic = True
pos = token_end

def _get_html_block_elements(self):
    """
    Parse htmlContent again and collect its block-level elements.

    The result is meant to line up, index for index, with the fragments
    that create_article_fragments() builds, so the same filtering is
    applied: blockquote containers are dropped and so are elements whose
    text is empty or whitespace only.

    Returns:
        A list of parsed elements in document order, or an empty list when
        the article has no htmlContent.
    """
    from bs4 import BeautifulSoup

    raw_html = getattr(self, "htmlContent", None)
    if not raw_html:
        return []

    parsed = BeautifulSoup(raw_html, "html.parser")
    return [
        tag
        for tag in parsed.find_all(BLOCK_TAGS)
        if tag.name != "blockquote" and tag.get_text().strip()
    ]

def _tokenize_fragments(self, tokenizer):
"""
Tokenize article fragments with batch MWE processing for efficiency.
Expand Down Expand Up @@ -539,6 +611,19 @@ def _tokenize_fragments(self, tokenizer):
frag_tokens, self.language.code, mode="stanza"
)

# Tag tokens with inline formatting (bold/italic) from HTML source
html_elements = self._get_html_block_elements()
if html_elements:
for frag_idx, frag_tokens in enumerate(fragment_tokens_list):
if frag_idx < len(html_elements):
element = html_elements[frag_idx]
formatting_ranges = self._extract_inline_formatting(element)
if formatting_ranges:
plain_text = element.get_text()
self._tag_tokens_with_formatting(
frag_tokens, plain_text, formatting_ranges
)

# Build result
tokenized_fragments = []
for frag_idx, fragment in enumerate(fragments_list):
Expand Down
9 changes: 9 additions & 0 deletions zeeguu/core/tokenization/token.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ def __init__(
self.dep = dep
self.head = head
self.lemma = lemma
# Inline formatting flags - set from HTML source after tokenization
self.is_bold = False
self.is_italic = False

# MWE (Multi-Word Expression) fields - set by MWE detector after tokenization
self.mwe_group_id = None
self.mwe_role = None # "head" | "dependent" | None
Expand Down Expand Up @@ -106,6 +110,11 @@ def as_serializable_dictionary(self):
"head": self.head,
"lemma": self.lemma,
}
# Only include inline formatting flags when set (to minimize payload)
if self.is_bold:
result["is_bold"] = True
if self.is_italic:
result["is_italic"] = True
# Only include MWE fields if token is part of an MWE (to minimize payload)
if self.mwe_group_id:
result["mwe_group_id"] = self.mwe_group_id
Expand Down
Loading