Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(
def update_usfm(
self,
book_id: str,
chapters: Optional[Sequence[int]] = None,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
Expand Down Expand Up @@ -61,7 +62,7 @@ def update_usfm(
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
return handler.get_usfm(self._settings.stylesheet)
return handler.get_usfm(self._settings.stylesheet, chapters)
except Exception as e:
error_message = (
f"An error occurred while parsing the usfm for '{book_id}'"
Expand Down
34 changes: 25 additions & 9 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,27 +334,43 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
if embed_outside_of_block:
self._end_update_block(state, [scripture_ref])

def get_usfm(
    self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
) -> str:
    """Detokenize the handler's accumulated USFM tokens back into USFM text.

    Args:
        stylesheet: A stylesheet name/path, or an already-constructed
            ``UsfmStylesheet``, used to build the tokenizer.
        chapters: Optional chapter numbers to keep. When given, only the
            book's ``\\id`` header token and the tokens belonging to these
            chapters are emitted (see ``_get_incremental_draft_tokens``);
            when ``None``, every token is kept.

    Returns:
        The detokenized USFM string, with any collected remarks inserted
        as ``\\rem`` paragraphs immediately after each chapter marker.
    """
    if isinstance(stylesheet, str):
        stylesheet = UsfmStylesheet(stylesheet)
    tokenizer = UsfmTokenizer(stylesheet)
    tokens = list(self._tokens)
    if chapters is not None:
        tokens = self._get_incremental_draft_tokens(tokens, chapters)
    if len(self._remarks) > 0:
        # Build one \rem paragraph (marker token + text token) per remark.
        remark_tokens: List[UsfmToken] = []
        for remark in self._remarks:
            remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
            remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
        # Insert the remarks right after every chapter marker. The list is
        # mutated while being enumerated, but the inserted tokens are
        # PARAGRAPH/TEXT — never CHAPTER — so no chapter is matched twice.
        for index, token in enumerate(tokens):
            if token.type == UsfmTokenType.CHAPTER:
                tokens[index + 1 : index + 1] = remark_tokens
    return tokenizer.detokenize(tokens)

def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
    """Return only the tokens needed for an incremental draft.

    Keeps the leading ``\\id`` token (when it is the very first token) and
    every token that falls inside one of the requested *chapters*; all
    other tokens — including front matter between ``\\id`` and the first
    selected chapter — are dropped.
    """
    kept: List[UsfmToken] = []
    keeping = False
    for position, tok in enumerate(tokens):
        if position == 0 and tok.marker == "id":
            # The book id header is always preserved.
            kept.append(tok)
        elif tok.type == UsfmTokenType.CHAPTER:
            # A chapter marker switches filtering on or off; a marker with
            # no numeric data turns it off.
            keeping = bool(tok.data) and int(tok.data) in chapters
            if keeping:
                kept.append(tok)
        elif keeping:
            kept.append(tok)
    return kept

def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
row_texts: List[str] = []
row_metadata = None
Expand Down
Loading