From cab499af93df92e3ffcf0e262bffbc8b6ab9b5dd Mon Sep 17 00:00:00 2001 From: Peter Chapman Date: Thu, 26 Mar 2026 17:15:06 +1300 Subject: [PATCH] Handle USFM and Text corpora separately in pre-processing --- .../Services/ParallelCorpusService.cs | 110 +++++++++++------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs index a4e11124..c07316b1 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs @@ -153,54 +153,80 @@ public async Task PreprocessAsync( ignoreUsfmMarkers ??= []; bool parallelTrainingDataPresent = false; - List keyTermTrainingData = new(); + List keyTermTrainingData = []; + + // Iterate over USFM and Text training corpora separately. + // This is not only because they use different keys, but if we have text corpora + // with scripture corpora, we don't want to exclude the text corpora from training. + foreach (bool isScripture in new bool[] { true, false }) + { + // Create source and target arrays of text corpora filtered for training + // based on the filters specified in the associated monolingual corpora + ITextCorpus[] sourceTrainingCorpora = + [ + .. corpusBundle.SourceTextCorpora.SelectMany(c => + c.TextCorpora.Where(tc => isScripture ? tc.IsScripture() : !tc.IsScripture()) + .Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) + ), + ]; + + ITextCorpus[] targetTrainingCorpora = + [ + .. corpusBundle.TargetTextCorpora.SelectMany(c => + c.TextCorpora.Where(tc => isScripture ? tc.IsScripture() : !tc.IsScripture()) + .Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) + ), + ]; + + // To support mixed source, collapse multiple source text corpora into one text corpus + // by randomly interlacing content from each of the source text corpora + ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); + if (sourceTrainingCorpus.IsScripture()) + { + // Filter out all non-scripture; we only train on scripture content + sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow); + } - // Create source and target arrays of text corpora filtered for training - // based on the filters specified in the associated monolingual corpora - ITextCorpus[] sourceTrainingCorpora = corpusBundle - .SourceTextCorpora.SelectMany(c => - c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) - ) - .ToArray(); + // Instead of interlacing rows from the target text corpora randomly, just take the + // text row from the first target text corpus that has content for that row + ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst(); + if (targetTrainingCorpus.IsScripture()) + { + // Filter out all non-scripture; we only train on scripture content + targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow); + } - ITextCorpus[] targetTrainingCorpora = corpusBundle - .TargetTextCorpora.SelectMany(c => - c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) - ) - .ToArray(); + // Align source and target training data + ParallelTextRow[] trainingRows = + [ + .. sourceTrainingCorpus.AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true), + ]; - // To support mixed source, collapse multiple source text corpora into one text corpus - // by randomly interlacing content from each of the source text corpora - ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); - if (sourceTrainingCorpus.IsScripture()) - { - // Filter out all non-scripture; we only train on scripture content - sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow); + // After merging segments across ranges, run the 'train' preprocessing function + // on each training row and record whether any parallel training data was present + foreach (Row row in CollapseRanges(trainingRows)) + { + await train(row, TrainingDataType.Text); + if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + { + parallelTrainingDataPresent = true; + } + } } - // Instead of interlacing rows from the target text corpora randomly, just take the - // text row from the first target text corpus that has content for that row - ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst(); - if (targetTrainingCorpus.IsScripture()) + // Get the target corpus from the training corpora. + // This is across Scripture and non-Scripture corpora + ITextCorpus[] targetCorpora = + [ + .. corpusBundle.TargetTextCorpora.SelectMany(c => + c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) + ), + ]; + ITextCorpus targetCorpus = targetCorpora.ChooseFirst(); + if (targetCorpus.IsScripture()) { // Filter out all non-scripture; we only train on scripture content - targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow); - } - - // Align source and target training data - ParallelTextRow[] trainingRows = sourceTrainingCorpus - .AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true) - .ToArray(); - - // After merging segments across ranges, run the 'train' preprocessing function - // on each training row and record whether any parallel training data was present - foreach (Row row in CollapseRanges(trainingRows)) - { - await train(row, TrainingDataType.Text); - if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) - { - parallelTrainingDataPresent = true; - } + targetCorpus = targetCorpus.Where(IsScriptureRow); } if (useKeyTerms) @@ -255,7 +281,7 @@ public async Task PreprocessAsync( { sourceInferencingCorpus, targetInferencingCorpus, - targetTrainingCorpus, + targetCorpus, }.AlignMany([true, false, false]); foreach ((Row row, bool isInTrainingData) in CollapseInferencingRanges(inferencingCorpus.ToArray()))