From cab499af93df92e3ffcf0e262bffbc8b6ab9b5dd Mon Sep 17 00:00:00 2001
From: Peter Chapman <peter@conglomo.co.nz>
Date: Thu, 26 Mar 2026 17:15:06 +1300
Subject: [PATCH] Handle USFM and Text corpora separately in pre-processing

---
 .../Services/ParallelCorpusService.cs         | 110 +++++++++++-------
 1 file changed, 68 insertions(+), 42 deletions(-)
diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs
index a4e11124..c07316b1 100644
--- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs
+++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs
@@ -153,54 +153,80 @@ public async Task PreprocessAsync(
         ignoreUsfmMarkers ??= [];
 
         bool parallelTrainingDataPresent = false;
-        List<Row> keyTermTrainingData = new();
+        List<Row> keyTermTrainingData = [];
+
+        // Process Scripture (USFM) and non-Scripture (text) training corpora in separate passes.
+        // They use different keys, and when text corpora are supplied alongside Scripture
+        // corpora, we do not want the text corpora to be excluded from training.
+        foreach (bool isScripture in new bool[] { true, false })
+        {
+            // Create source and target arrays of text corpora filtered for training
+            // based on the filters specified in the associated monolingual corpora
+            ITextCorpus[] sourceTrainingCorpora =
+            [
+                .. corpusBundle.SourceTextCorpora.SelectMany(c =>
+                    c.TextCorpora.Where(tc => isScripture ? tc.IsScripture() : !tc.IsScripture())
+                        .Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
+                ),
+            ];
+
+            ITextCorpus[] targetTrainingCorpora =
+            [
+                .. corpusBundle.TargetTextCorpora.SelectMany(c =>
+                    c.TextCorpora.Where(tc => isScripture ? tc.IsScripture() : !tc.IsScripture())
+                        .Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
+                ),
+            ];
+
+            // To support mixed source, collapse multiple source text corpora into one text corpus
+            // by randomly interlacing content from each of the source text corpora
+            ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed);
+            if (sourceTrainingCorpus.IsScripture())
+            {
+                // Filter out all non-Scripture rows; a Scripture corpus is trained on Scripture content only
+                sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow);
+            }
 
-        // Create source and target arrays of text corpora filtered for training
-        // based on the filters specified in the associated monolingual corpora
-        ITextCorpus[] sourceTrainingCorpora = corpusBundle
-            .SourceTextCorpora.SelectMany(c =>
-                c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
-            )
-            .ToArray();
+            // Instead of interlacing rows from the target text corpora randomly, just take the
+            // text row from the first target text corpus that has content for that row
+            ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst();
+            if (targetTrainingCorpus.IsScripture())
+            {
+                // Filter out all non-Scripture rows; a Scripture corpus is trained on Scripture content only
+                targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow);
+            }
 
-        ITextCorpus[] targetTrainingCorpora = corpusBundle
-            .TargetTextCorpora.SelectMany(c =>
-                c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
-            )
-            .ToArray();
+            // Align source and target training data
+            ParallelTextRow[] trainingRows =
+            [
+                .. sourceTrainingCorpus.AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true),
+            ];
 
-        // To support mixed source, collapse multiple source text corpora into one text corpus
-        // by randomly interlacing content from each of the source text corpora
-        ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed);
-        if (sourceTrainingCorpus.IsScripture())
-        {
-            // Filter out all non-scripture; we only train on scripture content
-            sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow);
+            // After merging segments across ranges, run the 'train' preprocessing function
+            // on each training row and record whether any parallel training data was present
+            foreach (Row row in CollapseRanges(trainingRows))
+            {
+                await train(row, TrainingDataType.Text);
+                if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
+                {
+                    parallelTrainingDataPresent = true;
+                }
+            }
         }
 
-        // Instead of interlacing rows from the target text corpora randomly, just take the
-        // text row from the first target text corpus that has content for that row
-        ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst();
-        if (targetTrainingCorpus.IsScripture())
+        // Build a single target corpus from all of the target training corpora,
+        // Scripture and non-Scripture alike; it is used below when aligning the inferencing corpora.
+        ITextCorpus[] targetCorpora =
+        [
+            .. corpusBundle.TargetTextCorpora.SelectMany(c =>
+                c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
+            ),
+        ];
+        ITextCorpus targetCorpus = targetCorpora.ChooseFirst();
+        if (targetCorpus.IsScripture())
         {
             // Filter out all non-scripture; we only train on scripture content
-            targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow);
-        }
-
-        // Align source and target training data
-        ParallelTextRow[] trainingRows = sourceTrainingCorpus
-            .AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true)
-            .ToArray();
-
-        // After merging segments across ranges, run the 'train' preprocessing function
-        // on each training row and record whether any parallel training data was present
-        foreach (Row row in CollapseRanges(trainingRows))
-        {
-            await train(row, TrainingDataType.Text);
-            if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
-            {
-                parallelTrainingDataPresent = true;
-            }
+            targetCorpus = targetCorpus.Where(IsScriptureRow);
         }
 
         if (useKeyTerms)
@@ -255,7 +281,7 @@ public async Task PreprocessAsync(
             {
                 sourceInferencingCorpus,
                 targetInferencingCorpus,
-                targetTrainingCorpus,
+                targetCorpus,
             }.AlignMany([true, false, false]);
 
             foreach ((Row row, bool isInTrainingData) in CollapseInferencingRanges(inferencingCorpus.ToArray()))