From 4243a2fba727bf96182db186c7be1f6985fb854d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 26 Feb 2026 17:01:02 -0500 Subject: [PATCH 1/5] Update machine version; support daughter projects in preprocessing Initial pass at daughter project support Refactor paratext project parallel corpus handling (unfinished) Remove unnecessary classes Progress toward finished tests Passing tests --- .../EchoEngine/TranslationEngineServiceV1.cs | 11 +- .../WordAlignmentEngineServiceV1.cs | 12 +- .../Services/NmtPreprocessBuildJob.cs | 24 +- .../Services/PreprocessBuildJob.cs | 75 +++-- .../Services/SmtTransferPreprocessBuildJob.cs | 4 +- .../Services/TranslationPreprocessBuildJob.cs | 18 +- .../WordAlignmentPreprocessBuildJob.cs | 20 +- .../src/Serval.Machine.Shared/Usings.cs | 1 - .../Services/NmtEngineServiceTests.cs | 2 +- .../Services/PreprocessBuildJobTests.cs | 128 +++++++- .../Services/SmtTransferEngineServiceTests.cs | 2 +- .../Services/StatisticalEngineServiceTests.cs | 2 +- .../IServiceCollectionExtensions.cs | 1 - .../Services/IScriptureDataFileService.cs | 7 - .../Services/ScriptureDataFileService.cs | 31 -- .../ZipParatextProjectSettingsParser.cs | 4 - src/Serval/src/Serval.Shared/Usings.cs | 5 - .../Services/EngineService.cs | 9 +- .../Services/PretranslationService.cs | 118 +++++++- .../Services/EngineService.cs | 9 +- .../Services/ScriptureDataFileServiceTests.cs | 106 ------- src/Serval/test/Serval.Shared.Tests/Usings.cs | 9 - .../Services/EngineServiceTests.cs | 62 ++-- .../Services/PretranslationServiceTests.cs | 173 ++++------- .../test/Serval.Translation.Tests/Usings.cs | 4 - .../data/pt-project1/41MATTe1.SFM | 6 + .../data/pt-project1/Settings.xml | 34 +++ .../data/pt-project1/custom.vrs | 31 ++ .../data/pt-project2/41MATTe2.SFM | 0 .../data/pt-project2/Settings.xml | 33 ++ .../data/pt-project2/custom.vrs | 31 ++ .../data/pt-project3/41MATTe3.SFM | 5 + .../data/pt-project3/Settings.xml | 33 ++ .../data/pt-project3/custom.vrs | 31 ++ .../Services/EngineServiceTests.cs | 60 ++-- .../test/Serval.WordAlignment.Tests/Usings.cs | 4 - .../IServiceCollectionsExtensions.cs | 3 +- .../Models/MissingParentProjectError.cs | 7 + .../Models/TrainingDataType.cs | 2 +- .../Services/CorpusBundle.cs | 226 ++++++++++++++ .../IParallelCorpusPreprocessingService.cs | 17 -- .../Services/IParallelCorpusService.cs | 26 ++ .../Services/ITextCorpusService.cs | 7 - ...ingService.cs => ParallelCorpusService.cs} | 284 ++++++++++-------- .../Services/TextCorpusService.cs | 51 ---- .../Services/ZipParatextProjectFileHandler.cs | 2 +- .../ZipParatextProjectSettingsParser.cs | 6 + .../Services/ZipParatextProjectTextUpdater.cs | 4 +- .../src/SIL.ServiceToolkit/Usings.cs | 1 + .../Services/CorpusBundleTests.cs | 279 +++++++++++++++++ ...Tests.cs => ParallelCorpusServiceTests.cs} | 21 +- .../test/SIL.ServiceToolkit.Tests/Usings.cs | 2 + .../Utils/NUnitExtensions.cs | 9 + 53 files changed, 1383 insertions(+), 669 deletions(-) delete mode 100644 src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs delete mode 100644 src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs delete mode 100644 src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs delete mode 100644 src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project2/41MATTe2.SFM create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml create mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs delete mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs delete mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs rename src/ServiceToolkit/src/SIL.ServiceToolkit/Services/{ParallelCorpusPreprocessingService.cs => ParallelCorpusService.cs} (66%) delete mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs rename src/{Serval/src/Serval.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Services/ZipParatextProjectFileHandler.cs (97%) create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs rename src/{Serval/src/Serval.Shared => ServiceToolkit/src/SIL.ServiceToolkit}/Services/ZipParatextProjectTextUpdater.cs (91%) create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs rename src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/{ParallelCorpusProcessingServiceTests.cs => ParallelCorpusServiceTests.cs} (95%) create mode 100644 src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs diff --git a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs index 60541a976..24998c3ec 100644 --- a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs @@ -4,7 +4,7 @@ namespace EchoEngine; public class TranslationEngineServiceV1( BackgroundTaskQueue taskQueue, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusPreprocessingService, TranslationPlatformApi.TranslationPlatformApiClient platformApiClient ) : TranslationEngineApi.TranslationEngineApiBase { @@ -12,8 +12,7 @@ TranslationPlatformApi.TranslationPlatformApiClient platformApiClient private readonly BackgroundTaskQueue _taskQueue = taskQueue; private readonly TranslationPlatformApi.TranslationPlatformApiClient _platformApiClient = platformApiClient; - private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = - parallelCorpusPreprocessingService; + private readonly IParallelCorpusService _parallelCorpusPreprocessingService = parallelCorpusPreprocessingService; public override Task Create(CreateRequest request, ServerCallContext context) { @@ -126,21 +125,21 @@ await client.BuildStartedAsync( List pretranslationsRequests = []; await _parallelCorpusPreprocessingService.PreprocessAsync( - request.Corpora.Select(Map).ToList(), + new SIL.ServiceToolkit.Utils.CorpusBundle(request.Corpora.Select(Map)), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; return Task.CompletedTask; }, - (row, isInTrainingData, corpus) => + (row, isInTrainingData, corpusId) => { string[] tokens = row.SourceSegment.Split(); pretranslationsRequests.Add( new InsertPretranslationsRequest { EngineId = request.EngineId, - CorpusId = corpus.Id, + CorpusId = corpusId, TextId = row.TextId, SourceRefs = { row.SourceRefs.Select(r => r.ToString()) }, TargetRefs = { row.TargetRefs.Select(r => r.ToString()) }, diff --git a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs index 98ba34c93..df88dd181 100644 --- a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs @@ -1,16 +1,16 @@ using Serval.WordAlignment.V1; +using SIL.ServiceToolkit.Utils; namespace EchoEngine; public class WordAlignmentEngineServiceV1( BackgroundTaskQueue taskQueue, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService + IParallelCorpusService parallelCorpusPreprocessingService ) : WordAlignmentEngineApi.WordAlignmentEngineApiBase { private static readonly Empty Empty = new(); private readonly BackgroundTaskQueue _taskQueue = taskQueue; - private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = - parallelCorpusPreprocessingService; + private readonly IParallelCorpusService _parallelCorpusPreprocessingService = parallelCorpusPreprocessingService; public override Task Create(CreateRequest request, ServerCallContext context) { @@ -80,20 +80,20 @@ await client.BuildStartedAsync( int wordAlignCount = 0; List wordAlignmentsRequests = []; await _parallelCorpusPreprocessingService.PreprocessAsync( - request.Corpora.Select(Map).ToList(), + new CorpusBundle(request.Corpora.Select(Map)), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; return Task.CompletedTask; }, - (row, isInTrainingData, corpus) => + (row, isInTrainingData, corpusId) => { wordAlignmentsRequests.Add( new InsertWordAlignmentsRequest { EngineId = request.EngineId, - CorpusId = corpus.Id, + CorpusId = corpusId, TextId = row.TextId, SourceRefs = { row.SourceRefs.Select(r => r.ToString()) }, TargetRefs = { row.TargetRefs.Select(r => r.ToString()) }, diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index f64da3c73..20e95f3bc 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -8,7 +8,7 @@ public class NmtPreprocessBuildJob( IBuildJobService buildJobService, ISharedFileService sharedFileService, ILanguageTagService languageTagService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusPreprocessingService, IOptionsMonitor options ) : TranslationPreprocessBuildJob( @@ -33,22 +33,12 @@ private bool ResolveLanguageCode(string languageCode, out string resolvedCode) protected override async Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ) { - List quoteConventionAnalyses = []; - foreach (ParallelCorpus parallelCorpus in corpora) - { - QuoteConventionAnalysis? targetQuotationConventionAnalysis = - ParallelCorpusPreprocessingService.AnalyzeTargetCorpusQuoteConvention(parallelCorpus); - if (targetQuotationConventionAnalysis != null) - quoteConventionAnalyses.Add(targetQuotationConventionAnalysis); - } - string overallTargetQuoteConventionAnalysis = - QuoteConventionAnalysis.CombineWithWeightedAverage(quoteConventionAnalyses)?.BestQuoteConvention?.Name - ?? string.Empty; + ParallelCorpusService.AnalyzeTargetQuoteConvention(corpusBundle)?.BestQuoteConvention?.Name ?? string.Empty; await PlatformService.UpdateTargetQuoteConventionAsync( engineId, @@ -65,7 +55,7 @@ protected override async Task UpdateBuildExecutionData( int pretranslateCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ) { @@ -84,7 +74,7 @@ CancellationToken cancellationToken pretranslateCount, sourceLanguageTag, targetLanguageTag, - corpora + corpusBundle ); int maxWarnings = BuildJobOptions.MaxWarnings; @@ -128,12 +118,12 @@ protected override IReadOnlyList GetWarnings( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora + CorpusBundle corpusBundle ) { List warnings = [ - .. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, corpora), + .. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, corpusBundle), ]; // Has at least a Gospel of Mark amount of data and not the special case of no data which will be caught elsewhere diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 630cf5d2e..5e82c1d35 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -7,7 +7,7 @@ public abstract class PreprocessBuildJob( ILogger> logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : HangfireBuildJob>( @@ -31,8 +31,7 @@ IOptionsMonitor options internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML; protected readonly BuildJobOptions BuildJobOptions = options.CurrentValue; protected readonly ISharedFileService SharedFileService = sharedFileService; - protected readonly IParallelCorpusPreprocessingService ParallelCorpusPreprocessingService = - parallelCorpusPreprocessingService; + protected readonly IParallelCorpusService ParallelCorpusService = parallelCorpusService; protected override async Task DoWorkAsync( string engineId, @@ -46,9 +45,11 @@ CancellationToken cancellationToken if (engine is null) throw new OperationCanceledException($"Engine {engineId} does not exist. Build canceled."); + CorpusBundle corpusBundle = new(data); + (int trainCount, int inferenceCount) = await WriteDataFilesAsync( buildId, - data, + corpusBundle, buildOptions, cancellationToken ); @@ -60,11 +61,11 @@ await UpdateBuildExecutionData( inferenceCount, engine.SourceLanguage, engine.TargetLanguage, - data, + corpusBundle, cancellationToken ); - await UpdateTargetQuoteConventionAsync(engineId, buildId, data, cancellationToken); + await UpdateTargetQuoteConventionAsync(engineId, buildId, corpusBundle, cancellationToken); if (inferenceCount == 0 && engine is TranslationEngine { IsModelPersisted: false }) { @@ -95,20 +96,20 @@ protected abstract Task UpdateBuildExecutionData( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ); protected virtual Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ) => Task.CompletedTask; protected abstract Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - IReadOnlyList corpora, + CorpusBundle corpusBundle, string? buildOptions, CancellationToken cancellationToken ); @@ -116,7 +117,7 @@ CancellationToken cancellationToken protected override async Task CleanupAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList parallelCorpora, JobCompletionStatus completionStatus ) { @@ -138,34 +139,48 @@ protected virtual IReadOnlyList GetWarnings( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora + CorpusBundle corpusBundle ) { List warnings = []; - foreach (ParallelCorpus parallelCorpus in corpora) + foreach ( + ( + string parallelCorpusId, + string monolingualCorpusId, + IReadOnlyList errors + ) in ParallelCorpusService.AnalyzeUsfmVersification(corpusBundle) + ) { - IReadOnlyList<(string MonolingualCorpusId, IReadOnlyList errors)> errorsPerCorpus = - ParallelCorpusPreprocessingService.AnalyzeUsfmVersification(parallelCorpus); - - foreach ((string monolingualCorpusId, IReadOnlyList errors) in errorsPerCorpus) + foreach (UsfmVersificationError error in errors) { - foreach (UsfmVersificationError error in errors) - { - warnings.Add( - error.Type switch - { - UsfmVersificationErrorType.InvalidChapterNumber => - $"Invalid chapter number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})", - UsfmVersificationErrorType.InvalidVerseNumber => - $"Invalid verse number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})", - _ => - $"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})", - } - ); - } + warnings.Add( + error.Type switch + { + UsfmVersificationErrorType.InvalidChapterNumber => + $"Invalid chapter number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})", + UsfmVersificationErrorType.InvalidVerseNumber => + $"Invalid verse number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})", + _ => + $"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})", + } + ); } } + + foreach ( + ( + string parallelCorpusId, + string monolingualCorpusId, + MissingParentProjectError error + ) in ParallelCorpusService.FindMissingParentProjects(corpusBundle) + ) + { + warnings.Add( + $"Unable to locate parent project {error.ParentProjectName} of daughter project {error.ProjectName} (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})" + ); + } + return warnings; } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs index adbf66e85..d9f9253c3 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs @@ -9,7 +9,7 @@ public class SmtTransferPreprocessBuildJob( ISharedFileService sharedFileService, IDistributedReaderWriterLockFactory lockFactory, IRepository trainSegmentPairs, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusPreprocessingService, IOptionsMonitor options ) : TranslationPreprocessBuildJob( @@ -29,7 +29,7 @@ IOptionsMonitor options protected override async Task InitializeAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList corpora, CancellationToken cancellationToken ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs index 8c1aacf33..7870b3703 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs @@ -7,7 +7,7 @@ public class TranslationPreprocessBuildJob( ILogger> logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusPreprocessingService, IOptionsMonitor options ) : PreprocessBuildJob( @@ -23,7 +23,7 @@ IOptionsMonitor options { protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - IReadOnlyList corpora, + CorpusBundle corpusBundle, string? buildOptions, CancellationToken cancellationToken ) @@ -55,13 +55,13 @@ await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.tx int trainCount = 0; int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); - await ParallelCorpusPreprocessingService.PreprocessAsync( - corpora, + await ParallelCorpusService.PreprocessAsync( + corpusBundle, async (row, trainingDataType) => { if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0) { - if (trainingDataType == TrainingDataType.KeyTerms) + if (trainingDataType == TrainingDataType.KeyTerm) { await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n"); await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n"); @@ -75,12 +75,12 @@ await ParallelCorpusPreprocessingService.PreprocessAsync( if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; }, - async (row, isInTrainingData, corpus) => + async (row, isInTrainingData, corpusId) => { if (row.SourceSegment.Length > 0 && !isInTrainingData) { pretranslateWriter.WriteStartObject(); - pretranslateWriter.WriteString("corpusId", corpus.Id); + pretranslateWriter.WriteString("corpusId", corpusId); pretranslateWriter.WriteString("textId", row.TextId); pretranslateWriter.WriteStartArray("refs"); foreach (object rowRef in row.TargetRefs) @@ -109,7 +109,7 @@ protected override async Task UpdateBuildExecutionData( int pretranslateCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ) { @@ -118,7 +118,7 @@ CancellationToken cancellationToken pretranslateCount, sourceLanguageTag, targetLanguageTag, - corpora + corpusBundle ); // Log summary of build data diff --git a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs index 89f265e7f..0af94cd63 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs @@ -7,7 +7,7 @@ public class WordAlignmentPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusPreprocessingService, IOptionsMonitor options ) : PreprocessBuildJob( @@ -23,7 +23,7 @@ IOptionsMonitor options { protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - IReadOnlyList corpora, + CorpusBundle corpusBundle, string? buildOptions, CancellationToken cancellationToken ) @@ -55,13 +55,13 @@ await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.tx int trainCount = 0; int inferenceCount = 0; wordAlignmentWriter.WriteStartArray(); - await ParallelCorpusPreprocessingService.PreprocessAsync( - corpora, + await ParallelCorpusService.PreprocessAsync( + corpusBundle, async (row, trainingDataType) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) { - if (trainingDataType == TrainingDataType.KeyTerms) + if (trainingDataType == TrainingDataType.KeyTerm) { await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n"); await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n"); @@ -75,12 +75,12 @@ await ParallelCorpusPreprocessingService.PreprocessAsync( trainCount++; } }, - async (row, isInTrainingData, corpus) => + async (row, isInTrainingData, corpusId) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0 && !isInTrainingData) { wordAlignmentWriter.WriteStartObject(); - wordAlignmentWriter.WriteString("corpusId", corpus.Id); + wordAlignmentWriter.WriteString("corpusId", corpusId); wordAlignmentWriter.WriteString("textId", row.TextId); wordAlignmentWriter.WriteStartArray("refs"); foreach (object rowRef in row.TargetRefs) @@ -109,7 +109,7 @@ protected override async Task UpdateBuildExecutionData( int wordAlignCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ) { @@ -118,7 +118,7 @@ CancellationToken cancellationToken wordAlignCount, sourceLanguageTag, targetLanguageTag, - corpora + corpusBundle ); // Log summary of build data @@ -148,7 +148,7 @@ CancellationToken cancellationToken protected override Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - IReadOnlyList corpora, + CorpusBundle corpusBundle, CancellationToken cancellationToken ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Usings.cs b/src/Machine/src/Serval.Machine.Shared/Usings.cs index 5921669aa..7e6db4fa3 100644 --- a/src/Machine/src/Serval.Machine.Shared/Usings.cs +++ b/src/Machine/src/Serval.Machine.Shared/Usings.cs @@ -56,7 +56,6 @@ global using SIL.DataAccess; global using SIL.Machine.Corpora; global using SIL.Machine.Morphology.HermitCrab; -global using SIL.Machine.PunctuationAnalysis; global using SIL.Machine.Tokenization; global using SIL.Machine.Translation; global using SIL.Machine.Translation.Thot; diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs index 29d307d7b..3b2bd5574 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs @@ -330,7 +330,7 @@ public override object ActivateJob(Type jobType) _env.BuildJobService, _env.SharedFileService, new LanguageTagService(), - new ParallelCorpusPreprocessingService(new TextCorpusService()), + new ParallelCorpusService(), _env.BuildJobOptions ); } diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 1aa6e716d..afcb87275 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -343,10 +343,34 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() using TestEnvironment env = new(); env.PersistModel(); // MRK does not contain verse data, so there is no inferencing ParallelCorpus corpus = env.ParatextCorpus(trainOnTextIds: ["LEV"], inferenceTextIds: ["MRK"]); + var parallelCorpusService = new ParallelCorpusService(); + env.ParallelCorpusService = Substitute.For(); + env.ParallelCorpusService.When(s => + s.PreprocessAsync( + Arg.Any(), + Arg.Any>(), + Arg.Any>(), + Arg.Any(), + Arg.Any?>() + ) + ) + .Do(async callInfo => + { + CorpusBundle corpusBundle = callInfo.ArgAt(0); + DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( + corpusBundle, + ["LEV", "MRK", "MAT"], + ["MAT"] + ); - env.TextCorpusService = Substitute.For(); - env.TextCorpusService.CreateTextCorpora(Arg.Any>()) - .Returns([new DummyCorpus(["LEV", "MRK", "MAT"], ["MAT"])]); + await parallelCorpusService.PreprocessAsync( + dummyCorpusBundle, + callInfo.ArgAt>(1), + callInfo.ArgAt>(2), + callInfo.ArgAt(3), + callInfo.ArgAt?>(4) + ); + }); Assert.DoesNotThrowAsync(async () => { await env.RunBuildJobAsync(corpus); @@ -354,17 +378,47 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() } [Test] - public void RunAsync_OnlyParseSelectedBooks_TrainOnBadBook() + public async Task RunAsync_OnlyParseSelectedBooks_TrainOnBadBook() { using TestEnvironment env = new(); ParallelCorpus corpus = env.ParatextCorpus(trainOnTextIds: ["MAT"], inferenceTextIds: ["MRK"]); - env.TextCorpusService = Substitute.For(); - env.TextCorpusService.CreateTextCorpora(Arg.Any>()) - .Returns([new DummyCorpus(["LEV", "MRK", "MAT"], ["MAT"])]); - Assert.ThrowsAsync(async () => + var parallelCorpusService = new ParallelCorpusService(); + env.ParallelCorpusService = Substitute.For(); + ArgumentException? ex = null; + env.ParallelCorpusService.When(s => + s.PreprocessAsync( + Arg.Any(), + Arg.Any>(), + Arg.Any>(), + Arg.Any(), + Arg.Any?>() + ) + ) + .Do(async callInfo => + { + CorpusBundle corpusBundle = callInfo.ArgAt(0); + DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( + corpusBundle, + ["LEV", "MRK", "MAT"], + ["MAT"] + ); + ex = Assert.ThrowsAsync(async () => + { + await parallelCorpusService.PreprocessAsync( + dummyCorpusBundle, + callInfo.ArgAt>(1), + callInfo.ArgAt>(2), + callInfo.ArgAt(3), + callInfo.ArgAt?>(4) + ); + }); + }); + Assert.ThrowsAsync(async () => { await env.RunBuildJobAsync(corpus); }); + + Assert.That(ex, Is.Not.Null); } [Test] @@ -372,13 +426,43 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() { using TestEnvironment env = new(); ParallelCorpus corpus = env.ParatextCorpus(trainOnTextIds: ["LEV"], inferenceTextIds: ["MAT"]); - env.TextCorpusService = Substitute.For(); - env.TextCorpusService.CreateTextCorpora(Arg.Any>()) - .Returns([new DummyCorpus(["LEV", "MRK", "MAT"], ["MAT"])]); - Assert.ThrowsAsync(async () => + var parallelCorpusService = new ParallelCorpusService(); + env.ParallelCorpusService = Substitute.For(); + ArgumentException? ex = null; + env.ParallelCorpusService.When(s => + s.PreprocessAsync( + Arg.Any(), + Arg.Any>(), + Arg.Any>(), + Arg.Any(), + Arg.Any?>() + ) + ) + .Do(async callInfo => + { + CorpusBundle corpusBundle = callInfo.ArgAt(0); + DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( + corpusBundle, + ["LEV", "MRK", "MAT"], + ["MAT"] + ); + ex = Assert.ThrowsAsync(async () => + { + await parallelCorpusService.PreprocessAsync( + dummyCorpusBundle, + callInfo.ArgAt>(1), + callInfo.ArgAt>(2), + callInfo.ArgAt(3), + callInfo.ArgAt?>(4) + ); + }); + }); + Assert.ThrowsAsync(async () => { await env.RunBuildJobAsync(corpus); }); + + Assert.That(ex, Is.Not.Null); } [Test] @@ -501,12 +585,12 @@ private class TestEnvironment : DisposableBase private readonly TempDirectory _tempDir; public ISharedFileService SharedFileService { get; } - public ITextCorpusService TextCorpusService { get; set; } public IPlatformService PlatformService { get; } public MemoryRepository Engines { get; } public MemoryRepository TrainSegmentPairs { get; } public IDistributedReaderWriterLockFactory LockFactory { get; } public IBuildJobService BuildJobService { get; } + public IParallelCorpusService ParallelCorpusService { get; set; } public IClearMLService ClearMLService { get; } public IOptionsMonitor BuildJobOptions { get; } @@ -709,7 +793,6 @@ public TestEnvironment() } ); TrainSegmentPairs = new MemoryRepository(); - TextCorpusService = new TextCorpusService(); PlatformService = Substitute.For(); PlatformService.EngineGroup.Returns(EngineGroup.Translation); PlatformService.UpdateBuildExecutionDataAsync( @@ -788,6 +871,7 @@ public TestEnvironment() ], Engines ); + ParallelCorpusService = new ParallelCorpusService(); } public PreprocessBuildJob GetBuildJob(EngineType engineType) @@ -804,7 +888,7 @@ public PreprocessBuildJob GetBuildJob(EngineType engineType) BuildJobService, SharedFileService, new LanguageTagService(), - new ParallelCorpusPreprocessingService(TextCorpusService), + ParallelCorpusService, BuildJobOptions ); } @@ -819,7 +903,7 @@ public PreprocessBuildJob GetBuildJob(EngineType engineType) SharedFileService, LockFactory, TrainSegmentPairs, - new ParallelCorpusPreprocessingService(TextCorpusService), + ParallelCorpusService, BuildJobOptions ); } @@ -1130,4 +1214,16 @@ IEnumerator IEnumerable.GetEnumerator() return Texts.GetEnumerator(); } } + + private class DummyCorpusBundle(CorpusBundle corpusBundle, IEnumerable books, IEnumerable failsOn) + : CorpusBundle(corpusBundle.ParallelCorpora) + { + private IEnumerable FailsOn { get; } = failsOn; + private IEnumerable Books { get; } = books; + + protected override IReadOnlyList CreateTextCorpora(IReadOnlyList files) + { + return [new DummyCorpus(Books, FailsOn)]; + } + } } diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs index 9162fcf37..a852b846f 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs @@ -711,7 +711,7 @@ public override object ActivateJob(Type jobType) _env.SharedFileService, _env._lockFactory, _env.TrainSegmentPairs, - new ParallelCorpusPreprocessingService(new TextCorpusService()), + new ParallelCorpusService(), _env.BuildJobOptions ) { diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs index 63711d67f..4722b4b88 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs @@ -457,7 +457,7 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - new ParallelCorpusPreprocessingService(new TextCorpusService()), + new ParallelCorpusService(), _env.BuildJobOptions ) { diff --git a/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs index 3e3459b7a..a6f64f22a 100644 --- a/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs @@ -5,7 +5,6 @@ public static class IServiceCollectionExtensions public static IServalBuilder AddServal(this IServiceCollection services, IConfiguration configuration) { services.AddFileSystem(); - services.AddTransient(); return new ServalBuilder(services, configuration); } } diff --git a/src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs b/src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs deleted file mode 100644 index 08424a55d..000000000 --- a/src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace Serval.Shared.Services; - -public interface IScriptureDataFileService -{ - ParatextProjectSettings GetParatextProjectSettings(string filename); - ZipParatextProjectTextUpdater GetZipParatextProjectTextUpdater(string filename); -} diff --git a/src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs b/src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs deleted file mode 100644 index e38b8c08c..000000000 --- a/src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs +++ /dev/null @@ -1,31 +0,0 @@ -namespace Serval.Shared.Services; - -public class ScriptureDataFileService(IFileSystem fileSystem, IOptionsMonitor dataFileOptions) - : IScriptureDataFileService -{ - private readonly IFileSystem _fileSystem = fileSystem; - private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; - - public ParatextProjectSettings GetParatextProjectSettings(string filename) - { - using IZipContainer container = _fileSystem.OpenZipFile(GetFilePath(filename)); - return ParseProjectSettings(container); - } - - public ZipParatextProjectTextUpdater GetZipParatextProjectTextUpdater(string filename) - { - IZipContainer container = _fileSystem.OpenZipFile(GetFilePath(filename)); - return new ZipParatextProjectTextUpdater(container); - } - - private string GetFilePath(string filename) - { - return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); - } - - private static ParatextProjectSettings ParseProjectSettings(IZipContainer container) - { - ZipParatextProjectSettingsParser settingsParser = new(container); - return settingsParser.Parse(); - } -} diff --git a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs b/src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs deleted file mode 100644 index 907717e9e..000000000 --- a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs +++ /dev/null @@ -1,4 +0,0 @@ -namespace Serval.Shared.Services; - -public class ZipParatextProjectSettingsParser(IZipContainer projectContainer) - : ParatextProjectSettingsParserBase(new ZipParatextProjectFileHandler(projectContainer)) { } diff --git a/src/Serval/src/Serval.Shared/Usings.cs b/src/Serval/src/Serval.Shared/Usings.cs index c4bf91004..a55e44177 100644 --- a/src/Serval/src/Serval.Shared/Usings.cs +++ b/src/Serval/src/Serval.Shared/Usings.cs @@ -10,13 +10,8 @@ global using Microsoft.Extensions.Configuration; global using Microsoft.Extensions.Diagnostics.HealthChecks; global using Microsoft.Extensions.Logging; -global using Microsoft.Extensions.Options; -global using MongoDB.Driver; global using Serval.Shared.Configuration; global using Serval.Shared.Contracts; global using Serval.Shared.Models; -global using Serval.Shared.Services; global using Serval.Shared.Utils; global using SIL.DataAccess; -global using SIL.Machine.Corpora; -global using SIL.ServiceToolkit.Services; diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 8c208c742..1b987fe7a 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -12,7 +12,6 @@ public class EngineService( IOptionsMonitor dataFileOptions, IDataAccessContext dataAccessContext, ILoggerFactory loggerFactory, - IScriptureDataFileService scriptureDataFileService, IOutboxService outboxService, IOptionsMonitor translationOptions ) : OwnedEntityServiceBase(engines), IEngineService @@ -24,7 +23,6 @@ IOptionsMonitor translationOptions private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; private readonly IDataAccessContext _dataAccessContext = dataAccessContext; private readonly ILogger _logger = loggerFactory.CreateLogger(); - private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService; private readonly IOutboxService _outboxService = outboxService; private readonly IOptionsMonitor _translationOptions = translationOptions; @@ -264,13 +262,16 @@ await _outboxService.EnqueueMessageAsync( ); } - private Dictionary> GetChapters(string fileLocation, string scriptureRange) + protected virtual Dictionary> GetChapters(string fileLocation, string scriptureRange) { try { + using var archive = new ZipContainer( + Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, fileLocation) + ); return ScriptureRangeParser.GetChapters( scriptureRange, - _scriptureDataFileService.GetParatextProjectSettings(fileLocation).Versification + new ZipParatextProjectSettingsParser(archive).Parse().Versification ); } catch (ArgumentException ae) diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 588c3626a..9340388b9 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -8,14 +8,14 @@ public class PretranslationService( IRepository pretranslations, IRepository engines, IRepository builds, - IScriptureDataFileService scriptureDataFileService + IOptionsMonitor dataFileOptions ) : EntityServiceBase(pretranslations), IPretranslationService { private readonly IRepository _engines = engines; private readonly IRepository _builds = builds; - private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService; private const string AIDisclaimerRemark = "This draft of {0} was generated using AI on {1}. It should be reviewed and edited carefully."; + private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; public async Task> GetAllAsync( string engineId, @@ -58,6 +58,47 @@ public async Task GetUsfmAsync( if (build is null || build.DateFinished is null) throw new InvalidOperationException($"Could not find any completed builds for engine '{engineId}'."); + CorpusBundle corpusBundle; + if (build.TrainOn == null || build.Pretranslate == null) + { + if (parallelCorpus != null) + { + corpusBundle = new CorpusBundle(engine!.ParallelCorpora.Select(Map)); + } + else + { + corpusBundle = new CorpusBundle(engine!.Corpora.Select(c => Map(c, engine))); + } + } + else + { + HashSet referencedCorpora; + if (parallelCorpus != null) + { + referencedCorpora = build + .TrainOn.Select(t => t.ParallelCorpusRef) + .Concat(build.Pretranslate.Select(p => p.ParallelCorpusRef)) + .Where(r => r != null) + .Select(r => r!) + .ToHashSet(); + corpusBundle = new CorpusBundle( + engine!.ParallelCorpora.Where(pc => referencedCorpora.Contains(pc.Id)).Select(Map) + ); + } + else + { + referencedCorpora = build + .TrainOn.Select(t => t.CorpusRef) + .Concat(build.Pretranslate.Select(p => p.CorpusRef)) + .Where(r => r != null) + .Select(r => r!) + .ToHashSet(); + corpusBundle = new CorpusBundle( + engine!.Corpora.Where(c => referencedCorpora.Contains(c.Id)).Select(c => Map(c, engine)) + ); + } + } + string disclaimerRemark = string.Format( CultureInfo.InvariantCulture, AIDisclaimerRemark, @@ -115,12 +156,8 @@ public async Task GetUsfmAsync( if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext) throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora."); - ParatextProjectSettings sourceSettings = _scriptureDataFileService.GetParatextProjectSettings( - sourceFile.Filename - ); - ParatextProjectSettings targetSettings = _scriptureDataFileService.GetParatextProjectSettings( - targetFile.Filename - ); + ParatextProjectSettings sourceSettings = corpusBundle.GetSettings(GetFilePath(sourceFile.Filename))!; + ParatextProjectSettings targetSettings = corpusBundle.GetSettings(GetFilePath(targetFile.Filename))!; IEnumerable pretranslations = await GetAllAsync( engineId, @@ -166,8 +203,9 @@ PretranslationUsfmMarkerBehavior StyleBehavior p.StyleBehavior ) ); - using Shared.Services.ZipParatextProjectTextUpdater updater = - _scriptureDataFileService.GetZipParatextProjectTextUpdater(targetFile.Filename); + using SIL.ServiceToolkit.Services.ZipParatextProjectTextUpdater updater = corpusBundle.GetTextUpdater( + GetFilePath(targetFile.Filename) + ); switch (textOrigin) { case PretranslationUsfmTextOrigin.PreferExisting: @@ -242,8 +280,9 @@ PretranslationUsfmMarkerBehavior StyleBehavior && (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Source) ) { - using Shared.Services.ZipParatextProjectTextUpdater updater = - _scriptureDataFileService.GetZipParatextProjectTextUpdater(sourceFile.Filename); + using SIL.ServiceToolkit.Services.ZipParatextProjectTextUpdater updater = corpusBundle.GetTextUpdater( + GetFilePath(sourceFile.Filename) + ); // Copy and update the source book if it exists switch (textOrigin) @@ -565,4 +604,59 @@ pretranslationRow.Pretranslation.Alignment is not null : null ); } + + private SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceCorpora.Select(Map).ToArray(), + TargetCorpora = source.TargetCorpora.Select(Map).ToArray(), + }; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) + { + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = source.Language, + Files = source.Files.Select(Map).ToList(), + }; + } + + private SIL.ServiceToolkit.Models.ParallelCorpus Map(Corpus source, Engine engine) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceFiles.Select(f => Map(f, engine.SourceLanguage)).ToArray(), + TargetCorpora = source.TargetFiles.Select(f => Map(f, engine.TargetLanguage)).ToArray(), + }; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map(CorpusFile source, string language) + { + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = language, + Files = [Map(source)], + }; + } + + private SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) + { + return new SIL.ServiceToolkit.Models.CorpusFile + { + Location = GetFilePath(source.Filename), + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, + TextId = source.TextId, + }; + } + + private string GetFilePath(string filename) + { + return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); + } } diff --git a/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs b/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs index 6deac95f6..5b6addff7 100644 --- a/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs +++ b/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs @@ -10,7 +10,6 @@ public class EngineService( IOptionsMonitor dataFileOptions, IDataAccessContext dataAccessContext, ILoggerFactory loggerFactory, - IScriptureDataFileService scriptureDataFileService, IOutboxService outboxService, IOptionsMonitor wordAlignmentOptions ) : OwnedEntityServiceBase(engines), IEngineService @@ -21,7 +20,6 @@ IOptionsMonitor wordAlignmentOptions private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; private readonly IDataAccessContext _dataAccessContext = dataAccessContext; private readonly ILogger _logger = loggerFactory.CreateLogger(); - private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService; private readonly IOutboxService _outboxService = outboxService; private readonly IOptionsMonitor _wordAlignmentOptions = wordAlignmentOptions; @@ -126,13 +124,16 @@ await _outboxService.EnqueueMessageAsync( ); } - private Dictionary> GetChapters(string fileLocation, string scriptureRange) + protected virtual Dictionary> GetChapters(string fileLocation, string scriptureRange) { try { + using var archive = new ZipContainer( + Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, fileLocation) + ); return ScriptureRangeParser.GetChapters( scriptureRange, - _scriptureDataFileService.GetParatextProjectSettings(fileLocation).Versification + new ZipParatextProjectSettingsParser(archive).Parse().Versification ); } catch (ArgumentException ae) diff --git a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs b/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs deleted file mode 100644 index 5836085e5..000000000 --- a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs +++ /dev/null @@ -1,106 +0,0 @@ -namespace Serval.Shared.Services; - -[TestFixture] -public class ScriptureDataFileServiceTests -{ - [Test] - public void GetParatextProjectSettings() - { - TestEnvironment env = new(); - ParatextProjectSettings settings = env.Service.GetParatextProjectSettings("file1.zip"); - Assert.That(settings.Name, Is.EqualTo("PROJ")); - } - - [Test] - public void GetZipParatextProjectTextUpdater() - { - TestEnvironment env = new(); - using ZipParatextProjectTextUpdater updater = env.Service.GetZipParatextProjectTextUpdater("file1.zip"); - Assert.That( - updater.UpdateUsfm("MAT", [], textBehavior: UpdateUsfmTextBehavior.PreferExisting).ReplaceLineEndings("\n"), - Is.EqualTo( - $@"\id MAT - PROJ -\h {Canon.BookIdToEnglishName("MAT")} -\c 1 -\p -\v 1 Chapter one, verse one. -\v 2 Chapter one, verse two. -\c 2 -\p -\v 1 Chapter two, verse one. -\v 2 Chapter two, verse two. -" - ) - .IgnoreLineEndings() - ); - } - - private class TestEnvironment - { - public TestEnvironment() - { - IFileSystem fileSystem = Substitute.For(); - fileSystem - .OpenZipFile("file1.zip") - .Returns(ci => - { - IZipContainer container = CreateZipContainer(); - AddBook(container, "MAT"); - return container; - }); - IOptionsMonitor dataFileOptions = Substitute.For>(); - dataFileOptions.CurrentValue.Returns(new DataFileOptions()); - - Service = new ScriptureDataFileService(fileSystem, dataFileOptions); - } - - public ScriptureDataFileService Service { get; } - - private static IZipContainer CreateZipContainer() - { - IZipContainer container = Substitute.For(); - container.EntryExists("Settings.xml").Returns(true); - XElement settingsXml = new( - "ScriptureText", - new XElement("StyleSheet", "usfm.sty"), - new XElement("Guid", "ID"), - new XElement("Name", "PROJ"), - new XElement("FullName", "PROJ"), - new XElement("Encoding", "65001"), - new XElement( - "Naming", - new XAttribute("PrePart", ""), - new XAttribute("PostPart", "PROJ.SFM"), - new XAttribute("BookNameForm", "MAT") - ), - new XElement("BiblicalTermsListSetting", "Major::BiblicalTerms.xml") - ); - container - .OpenEntry("Settings.xml") - .Returns(new MemoryStream(Encoding.UTF8.GetBytes(settingsXml.ToString()))); - container.EntryExists("custom.vrs").Returns(false); - container.EntryExists("usfm.sty").Returns(false); - container.EntryExists("custom.sty").Returns(false); - return container; - } - - private static void AddBook(IZipContainer container, string book) - { - string bookFileName = $"{book}PROJ.SFM"; - container.EntryExists(bookFileName).Returns(true); - string usfm = - $@"\id {book} - PROJ -\h {Canon.BookIdToEnglishName(book)} -\c 1 -\p -\v 1 Chapter one, verse one. -\v 2 Chapter one, verse two. -\c 2 -\p -\v 1 Chapter two, verse one. -\v 2 Chapter two, verse two. -"; - container.OpenEntry(bookFileName).Returns(new MemoryStream(Encoding.UTF8.GetBytes(usfm))); - } - } -} diff --git a/src/Serval/test/Serval.Shared.Tests/Usings.cs b/src/Serval/test/Serval.Shared.Tests/Usings.cs index ec2251e6f..86da99b25 100644 --- a/src/Serval/test/Serval.Shared.Tests/Usings.cs +++ b/src/Serval/test/Serval.Shared.Tests/Usings.cs @@ -1,13 +1,4 @@ -global using System.Text; global using System.Text.Json; -global using System.Xml.Linq; -global using Microsoft.Extensions.Options; -global using NSubstitute; global using NUnit.Framework; global using NUnit.Framework.Constraints; -global using Serval.Shared.Configuration; -global using Serval.Shared.Utils; -global using SIL.Machine.Corpora; -global using SIL.Scripture; -global using SIL.ServiceToolkit.Services; global using SIL.ServiceToolkit.Utils; diff --git a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs index 9a9f63315..bc47ec373 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs @@ -2356,27 +2356,6 @@ public TestEnvironment() .Returns(TranslationServiceClient); IOptionsMonitor dataFileOptions = Substitute.For>(); dataFileOptions.CurrentValue.Returns(new DataFileOptions()); - var scriptureDataFileService = Substitute.For(); - scriptureDataFileService - .GetParatextProjectSettings(Arg.Any()) - .Returns( - new ParatextProjectSettings( - guid: "Id", - name: "Tst", - fullName: "Test", - encoding: Encoding.UTF8, - versification: ScrVers.English, - stylesheet: new UsfmStylesheet("usfm.sty"), - fileNamePrefix: "TST", - fileNameForm: "MAT", - fileNameSuffix: ".USFM", - biblicalTermsListType: "BiblicalTerms", - biblicalTermsProjectName: "", - biblicalTermsFileName: "BiblicalTerms.xml", - languageCode: "en", - translationType: "Standard" - ) - ); Pretranslations = new MemoryRepository(); OutboxService = Substitute.For(); @@ -2387,7 +2366,7 @@ public TestEnvironment() new TranslationOptions { Engines = [new EngineInfo { Type = "Smt" }] } ); - Service = new EngineService( + Service = new TestEngineService( Engines, new MemoryRepository(), Pretranslations, @@ -2396,7 +2375,6 @@ public TestEnvironment() dataFileOptions, new MemoryDataAccessContext(), new LoggerFactory(), - scriptureDataFileService, OutboxService, translationOptions ); @@ -2870,4 +2848,42 @@ private static AsyncUnaryCall CreateAsyncUnaryCall(TRespon ); } } + + private class TestEngineService( + IRepository engines, + IRepository builds, + IRepository pretranslations, + IScopedMediator mediator, + GrpcClientFactory grpcClientFactory, + IOptionsMonitor dataFileOptions, + IDataAccessContext dataAccessContext, + ILoggerFactory loggerFactory, + IOutboxService outboxService, + IOptionsMonitor translationOptions + ) + : EngineService( + engines, + builds, + pretranslations, + mediator, + grpcClientFactory, + dataFileOptions, + dataAccessContext, + loggerFactory, + outboxService, + translationOptions + ) + { + protected override Dictionary> GetChapters(string fileLocation, string scriptureRange) + { + try + { + return ScriptureRangeParser.GetChapters(scriptureRange); + } + catch (ArgumentException ae) + { + throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); + } + } + } } diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index b498f4da7..286576f82 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -1,25 +1,11 @@ -namespace Serval.Translation.Services; +using System.IO.Compression; +using SIL.Machine.Utils; + +namespace Serval.Translation.Services; [TestFixture] public class PretranslationServiceTests { - private const string SourceUsfm = - $@"\id MAT - SRC -\c 1 -\v 1 SRC - Chapter one, verse one. -\p new paragraph -\v 2 -\v 3 SRC - Chapter one, verse three. -"; - - private const string TargetUsfm = - @"\id MAT - TRG -\c 1 -\v 1 TRG - Chapter one, verse one. -\v 2 -\v 3 TRG - Chapter one, verse three. -"; - [Test] public async Task GetUsfmAsync_Source_PreferExisting() { @@ -33,7 +19,7 @@ public async Task GetUsfmAsync_Source_PreferExisting() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test2 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -60,7 +46,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test2 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -87,7 +73,7 @@ public async Task GetUsfmAsync_Source_OnlyExisting() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test2 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -114,7 +100,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test2 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -142,7 +128,7 @@ public async Task GetUsfmAsync_Source_PlaceMarkers() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test2 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Embed markers were moved to the end of the verse. Paragraph breaks have positions preserved. Style markers were removed. \c 1 @@ -159,8 +145,7 @@ public async Task GetUsfmAsync_Source_PlaceMarkers() [Test] public async Task GetUsfmAsync_Target_PreferExisting() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferExisting, @@ -186,8 +171,7 @@ public async Task GetUsfmAsync_Target_PreferExisting() [Test] public async Task GetUsfmAsync_Target_PreferPretranslated() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -197,7 +181,7 @@ public async Task GetUsfmAsync_Target_PreferPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test3 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -236,7 +220,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test2 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -253,8 +237,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() [Test] public async Task GetUsfmAsync_Auto_TargetBookExists() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -264,7 +247,7 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test3 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -280,15 +263,22 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() [Test] public async Task GetUsfmAsync_Target_OnlyExisting() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyExisting, PretranslationUsfmTemplate.Target ); - List lines = TargetUsfm.Split('\n').ToList(); + string targetUsfm = + @"\id MAT - Test3 +\c 1 +\v 1 TRG - Chapter one, verse one. +\v 2 +\v 3 TRG - Chapter one, verse three. +"; + + List lines = targetUsfm.Split('\n').ToList(); lines.Insert( 1, @@ -304,8 +294,7 @@ public async Task GetUsfmAsync_Target_OnlyExisting() [Test] public async Task GetUsfmAsync_Target_OnlyPretranslated() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyPretranslated, @@ -315,7 +304,7 @@ public async Task GetUsfmAsync_Target_OnlyPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test3 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -441,8 +430,31 @@ public void GetChapterRanges(int[] chapterNumbers, string expectedRangeString) private class TestEnvironment : IDisposable { - public TestEnvironment() + private static readonly string TestDataPath = Path.Combine("..", "..", "..", "data"); + + public TestEnvironment(bool addMatthew = false) { + _tempDir = new TempDirectory("PretranslationServiceTests"); + string file1Path = Path.Combine(_tempDir.Path, "file1.zip"); + if (!File.Exists(file1Path)) + { + ZipFile.CreateFromDirectory( + Path.Combine(TestDataPath, "pt-project1"), + Path.Combine(_tempDir.Path, "file1.zip") + ); + } + string file2Path = Path.Combine(_tempDir.Path, "file2.zip"); + if (File.Exists(file2Path)) + File.Delete(file2Path); + if (addMatthew) + { + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, "pt-project3"), file2Path); + } + else + { + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, "pt-project2"), file2Path); + } + CorpusFile file1 = new() { Id = "file1", @@ -636,50 +648,22 @@ public TestEnvironment() Translation = "Chapter 1, verse 2.", }, ]); - ScriptureDataFileService = Substitute.For(); - ScriptureDataFileService.GetParatextProjectSettings("file1.zip").Returns(CreateProjectSettings("SRC")); - ScriptureDataFileService.GetParatextProjectSettings("file2.zip").Returns(CreateProjectSettings("TRG")); - var zipSubstituteSource = Substitute.For(); - var zipSubstituteTarget = Substitute.For(); - zipSubstituteSource - .OpenEntry("MATSRC.SFM") - .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm))); - zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(""))); - zipSubstituteSource.EntryExists(Arg.Any()).Returns(false); - zipSubstituteTarget.EntryExists(Arg.Any()).Returns(false); - zipSubstituteSource.EntryExists("MATSRC.SFM").Returns(true); - zipSubstituteTarget.EntryExists("MATTRG.SFM").Returns(true); - TargetZipContainer = zipSubstituteTarget; - TextUpdaters = new List(); - Shared.Services.ZipParatextProjectTextUpdater GetTextUpdater(string type) - { - var updater = type switch - { - "SRC" => new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteSource, - CreateProjectSettings("SRC") - ), - "TRG" => new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteTarget, - CreateProjectSettings("TRG") - ), - _ => throw new ArgumentException(), - }; - TextUpdaters.Add(updater); - return updater; - } - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(x => GetTextUpdater("SRC")); - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(x => GetTextUpdater("TRG")); - Service = new PretranslationService(Pretranslations, Engines, Builds, ScriptureDataFileService); + IOptionsMonitor dataFileOptions = Substitute.For>(); + dataFileOptions.CurrentValue.Returns(new DataFileOptions() { FilesDirectory = _tempDir.Path }); + Service = new PretranslationService(Pretranslations, Engines, Builds, dataFileOptions); } public PretranslationService Service { get; } public MemoryRepository Pretranslations { get; } public MemoryRepository Engines { get; } public MemoryRepository Builds { get; } - public IScriptureDataFileService ScriptureDataFileService { get; } - public IZipContainer TargetZipContainer { get; } - public IList TextUpdaters { get; } + + private readonly TempDirectory _tempDir; + + public void Dispose() + { + _tempDir.Dispose(); + } public async Task GetUsfmAsync( PretranslationUsfmTextOrigin textOrigin, @@ -717,40 +701,5 @@ public async Task GetUsfmAsync( Assert.That(parallel_usfm, Is.EqualTo(usfm)); return usfm; } - - public void AddMatthewToTarget() - { - TargetZipContainer - .OpenEntry("MATTRG.SFM") - .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm))); - } - - private static ParatextProjectSettings CreateProjectSettings(string name) - { - return new ParatextProjectSettings( - guid: "Id", - name: name, - fullName: name, - encoding: Encoding.UTF8, - versification: ScrVers.English, - stylesheet: new UsfmStylesheet("usfm.sty"), - fileNamePrefix: "", - fileNameForm: "MAT", - fileNameSuffix: $"{name}.SFM", - biblicalTermsListType: "Major", - biblicalTermsProjectName: "", - biblicalTermsFileName: "BiblicalTerms.xml", - languageCode: "en", - translationType: "Standard" - ); - } - - public void Dispose() - { - foreach (var updater in TextUpdaters) - { - updater.Dispose(); - } - } } } diff --git a/src/Serval/test/Serval.Translation.Tests/Usings.cs b/src/Serval/test/Serval.Translation.Tests/Usings.cs index 48c63ed06..1403ffb4e 100644 --- a/src/Serval/test/Serval.Translation.Tests/Usings.cs +++ b/src/Serval/test/Serval.Translation.Tests/Usings.cs @@ -1,4 +1,3 @@ -global using System.Text; global using System.Text.RegularExpressions; global using Grpc.Core; global using Grpc.Net.ClientFactory; @@ -9,11 +8,8 @@ global using NUnit.Framework; global using Serval.Shared.Configuration; global using Serval.Shared.Models; -global using Serval.Shared.Services; global using Serval.Shared.Utils; global using Serval.Translation.Contracts; global using Serval.Translation.Models; global using SIL.DataAccess; -global using SIL.Machine.Corpora; -global using SIL.Scripture; global using SIL.ServiceToolkit.Services; diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM new file mode 100644 index 000000000..8130771c2 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM @@ -0,0 +1,6 @@ +\id MAT - SRC +\c 1 +\v 1 SRC - Chapter one, verse one. +\p new paragraph +\v 2 +\v 3 SRC - Chapter one, verse three. diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml new file mode 100644 index 000000000..6358f4f0b --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test1 + 65001 + T + + NFC + Te1 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Tes.SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project2/41MATTe2.SFM b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/41MATTe2.SFM new file mode 100644 index 000000000..e69de29bb diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml new file mode 100644 index 000000000..4ce9e238d --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml @@ -0,0 +1,33 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test2 + 65001 + T + + NFC + Te2 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM new file mode 100644 index 000000000..90cb675c5 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM @@ -0,0 +1,5 @@ +\id MAT - TRG +\c 1 +\v 1 TRG - Chapter one, verse one. +\v 2 +\v 3 TRG - Chapter one, verse three. diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml new file mode 100644 index 000000000..873f4ab4b --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml @@ -0,0 +1,33 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test3 + 65001 + T + + NFC + Te3 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs index 631c8fad8..c9ff894da 100644 --- a/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs @@ -1361,27 +1361,6 @@ public TestEnvironment() .Returns(WordAlignmentServiceClient); IOptionsMonitor dataFileOptions = Substitute.For>(); dataFileOptions.CurrentValue.Returns(new DataFileOptions()); - var scriptureDataFileService = Substitute.For(); - scriptureDataFileService - .GetParatextProjectSettings(Arg.Any()) - .Returns( - new ParatextProjectSettings( - guid: "Id", - name: "Tst", - fullName: "Test", - encoding: Encoding.UTF8, - versification: ScrVers.English, - stylesheet: new UsfmStylesheet("usfm.sty"), - fileNamePrefix: "TST", - fileNameForm: "MAT", - fileNameSuffix: ".USFM", - biblicalTermsListType: "BiblicalTerms", - biblicalTermsProjectName: "", - biblicalTermsFileName: "BiblicalTerms.xml", - languageCode: "en", - translationType: "Standard" - ) - ); WordAlignments = new MemoryRepository(); OutboxService = Substitute.For(); @@ -1392,7 +1371,7 @@ public TestEnvironment() new WordAlignmentOptions { Engines = [new EngineInfo { Type = "Statistical" }] } ); - Service = new EngineService( + Service = new TestEngineService( Engines, new MemoryRepository(), WordAlignments, @@ -1400,7 +1379,6 @@ public TestEnvironment() dataFileOptions, new MemoryDataAccessContext(), new LoggerFactory(), - scriptureDataFileService, OutboxService, wordAlignmentOptions ); @@ -1835,4 +1813,40 @@ private static AsyncUnaryCall CreateAsyncUnaryCall(TRespon } return alignedWordPairs; } + + private class TestEngineService( + IRepository engines, + IRepository builds, + IRepository wordAlignments, + GrpcClientFactory grpcClientFactory, + IOptionsMonitor dataFileOptions, + IDataAccessContext dataAccessContext, + ILoggerFactory loggerFactory, + IOutboxService outboxService, + IOptionsMonitor wordAlignmentOptions + ) + : EngineService( + engines, + builds, + wordAlignments, + grpcClientFactory, + dataFileOptions, + dataAccessContext, + loggerFactory, + outboxService, + wordAlignmentOptions + ) + { + protected override Dictionary> GetChapters(string fileLocation, string scriptureRange) + { + try + { + return ScriptureRangeParser.GetChapters(scriptureRange); + } + catch (ArgumentException ae) + { + throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); + } + } + } } diff --git a/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs b/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs index 7b14ff4a8..e1c4deec7 100644 --- a/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs +++ b/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs @@ -1,4 +1,3 @@ -global using System.Text; global using Grpc.Core; global using Grpc.Net.ClientFactory; global using MassTransit; @@ -8,9 +7,6 @@ global using NUnit.Framework; global using Serval.Shared.Configuration; global using Serval.Shared.Models; -global using Serval.Shared.Services; global using Serval.Shared.Utils; global using Serval.WordAlignment.Models; global using SIL.DataAccess; -global using SIL.Machine.Corpora; -global using SIL.Scripture; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs index 058414b64..6b681401c 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs @@ -4,8 +4,7 @@ public static class IServiceCollectionExtensions { public static IServiceCollection AddParallelCorpusPreprocessor(this IServiceCollection services) { - services.TryAddSingleton(); - services.TryAddSingleton(); + services.TryAddSingleton(); return services; } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs new file mode 100644 index 000000000..fb76591dc --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs @@ -0,0 +1,7 @@ +namespace SIL.ServiceToolkit.Models; + +public record MissingParentProjectError +{ + public required string ProjectName { get; init; } + public required string ParentProjectName { get; init; } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs index 2597ffc4b..46f23035b 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs @@ -3,5 +3,5 @@ namespace SIL.ServiceToolkit.Models; public enum TrainingDataType { Text = 0, - KeyTerms = 1, + KeyTerm = 1, } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs new file mode 100644 index 000000000..ea0d3f0e4 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs @@ -0,0 +1,226 @@ +using ZipParatextProjectTextUpdater = SIL.ServiceToolkit.Services.ZipParatextProjectTextUpdater; + +namespace SIL.ServiceToolkit.Utils; + +public class CorpusBundle +{ + private readonly Dictionary< + string, + (ParatextProjectSettings DaughterSettings, string? ParentLocation, ParatextProjectSettings? ParentSettings) + > _settings; + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> SourceTextCorpora { get; } + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> TargetTextCorpora { get; } + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> TextCorpora => SourceTextCorpora.Concat(TargetTextCorpora); + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> SourceTermCorpora { get; } + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> TargetTermCorpora { get; } + public IEnumerable ParallelCorpora { get; } + + public CorpusBundle(IEnumerable parallelCorpora) + { + ParallelCorpora = parallelCorpora.ToArray(); + + _settings = []; + IEnumerable corpusFiles = parallelCorpora.SelectMany(corpus => + corpus.SourceCorpora.Concat(corpus.TargetCorpora).SelectMany(c => c.Files) + ); + List<(string Location, ParatextProjectSettings Settings)> paratextProjects = []; + foreach (CorpusFile file in corpusFiles.Where(f => f.Format == FileFormat.Paratext)) + { + using IZipContainer archive = new ZipContainer(file.Location); + ParatextProjectSettings settings = new Services.ZipParatextProjectSettingsParser(archive).Parse(); + paratextProjects.Add((file.Location, settings)); + } + + foreach ((string daughterLocation, ParatextProjectSettings daughterSettings) in paratextProjects) + { + foreach ((string parentLocation, ParatextProjectSettings parentSettings) in paratextProjects) + { + if ( + daughterSettings != parentSettings + && daughterSettings.HasParent + && daughterSettings.IsDaughterProjectOf(parentSettings) + ) + { + daughterSettings.Parent = parentSettings; + _settings[daughterLocation] = (daughterSettings, parentLocation, parentSettings); + } + else + { + _settings[daughterLocation] = (daughterSettings, null, null); + } + } + } + + SourceTextCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.SourceCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTextCorpora(corpus.Files)) + ) + ); + + TargetTextCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.TargetCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTextCorpora(corpus.Files)) + ) + ); + + SourceTermCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.SourceCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTermCorpora(corpus.Files)) + ) + ); + + TargetTermCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.TargetCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTermCorpora(corpus.Files)) + ) + ); + } + + public (string Location, ParatextProjectSettings Settings)? ParentOf(string daughterLocation) + { + if ( + !_settings.TryGetValue( + daughterLocation, + out (ParatextProjectSettings _, string? Location, ParatextProjectSettings? Settings) parent + ) + ) + { + return null; + } + if (parent.Location == null || parent.Settings == null) + { + return null; + } + return (parent.Location, parent.Settings); + } + + public ParatextProjectSettings? GetSettings(string location) + { + if ( + !_settings.TryGetValue( + location, + out ( + ParatextProjectSettings ParatextProjectSettings, + string? ParentLocation, + ParatextProjectSettings? ParentSettings + ) settings + ) + ) + { + return null; + } + return settings.ParatextProjectSettings; + } + + public ZipParatextProjectTextUpdater GetTextUpdater(string location) + { + IZipContainer container = new ZipContainer(location); + ParatextProjectSettings? parentSettings = ParentOf(location)?.Settings; + return new ZipParatextProjectTextUpdater(container, parentSettings); + } + + protected virtual IReadOnlyList CreateTextCorpora(IReadOnlyList files) + { + List corpora = []; + + List> textFileCorpora = []; + foreach (CorpusFile file in files) + { + switch (file.Format) + { + case FileFormat.Text: + // if there are multiple texts with the same id, then add it to a new corpus or the first + // corpus that doesn't contain a text with that id + Dictionary? corpus = textFileCorpora.FirstOrDefault(c => + !c.ContainsKey(file.TextId) + ); + if (corpus is null) + { + corpus = []; + textFileCorpora.Add(corpus); + } + corpus[file.TextId] = new TextFileText(file.TextId, file.Location); + break; + + case FileFormat.Paratext: + string? parentLocation = null; + if ( + _settings.TryGetValue( + file.Location, + out (ParatextProjectSettings, string? ParentLocation, ParatextProjectSettings?) settings + ) + ) + { + parentLocation = settings.ParentLocation; + } + corpora.Add( + new ParatextBackupTextCorpus( + file.Location, + includeAllText: true, + parentFileName: parentLocation + ) + ); + break; + } + } + foreach (Dictionary corpus in textFileCorpora) + corpora.Add(new DictionaryTextCorpus(corpus.Values)); + + return corpora; + } + + private IReadOnlyList CreateTermCorpora(IReadOnlyList files) + { + List corpora = []; + foreach (CorpusFile file in files) + { + switch (file.Format) + { + case FileFormat.Paratext: + string? parentLocation = null; + if ( + _settings.TryGetValue( + file.Location, + out (ParatextProjectSettings, string? ParentLocation, ParatextProjectSettings?) settings + ) + ) + { + parentLocation = settings.ParentLocation; + } + corpora.Add(new ParatextBackupTermsCorpus(file.Location, ["PN"], parentFileName: parentLocation)); + break; + } + } + return corpora; + } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs deleted file mode 100644 index b1dbacd89..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs +++ /dev/null @@ -1,17 +0,0 @@ -namespace SIL.ServiceToolkit.Services; - -public interface IParallelCorpusPreprocessingService -{ - QuoteConventionAnalysis? AnalyzeTargetCorpusQuoteConvention(ParallelCorpus corpus); - IReadOnlyList<(string CorpusId, IReadOnlyList Errors)> AnalyzeUsfmVersification( - ParallelCorpus parallelCorpus - ); - - Task PreprocessAsync( - IReadOnlyList corpora, - Func train, - Func inference, - bool useKeyTerms = false, - HashSet? ignoreUsfmMarkers = null - ); -} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs new file mode 100644 index 000000000..bf3d7aef3 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs @@ -0,0 +1,26 @@ +namespace SIL.ServiceToolkit.Services; + +public interface IParallelCorpusService +{ + QuoteConventionAnalysis AnalyzeTargetQuoteConvention(CorpusBundle corpusBundle); + + IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + IReadOnlyList Errors + )> AnalyzeUsfmVersification(CorpusBundle corpusBundle); + + IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + MissingParentProjectError + )> FindMissingParentProjects(CorpusBundle corpusBundle); + + Task PreprocessAsync( + CorpusBundle corpusBundle, + Func train, + Func inference, + bool useKeyTerms = false, + HashSet? ignoreUsfmMarkers = null + ); +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs deleted file mode 100644 index 7e7651583..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace SIL.ServiceToolkit.Services; - -public interface ITextCorpusService -{ - IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList corpusFiles); -} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs similarity index 66% rename from src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs index b5b3d4fcb..c69ece379 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs @@ -2,80 +2,65 @@ namespace SIL.ServiceToolkit.Services; -public class ParallelCorpusPreprocessingService(ITextCorpusService textCorpusService) - : IParallelCorpusPreprocessingService +public class ParallelCorpusService : IParallelCorpusService { - private readonly ITextCorpusService _textCorpusService = textCorpusService; private const int Seed = 1234; - public IReadOnlyList<(string CorpusId, IReadOnlyList Errors)> AnalyzeUsfmVersification( - ParallelCorpus parallelCorpus - ) + public IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + IReadOnlyList Errors + )> AnalyzeUsfmVersification(CorpusBundle corpusBundle) { - List<(string CorpusId, IReadOnlyList Errors)> errorsPerCorpus = []; + List<( + string ParallelCorpusId, + string MonolingualCorpusId, + IReadOnlyList Errors + )> errorsPerCorpus = []; foreach ( - (CorpusFile file, MonolingualCorpus monolingualCorpus, bool isSource) in parallelCorpus - .SourceCorpora.SelectMany(c => - c.Files.Where(f => f.Format == FileFormat.Paratext).Select(f => (f, c, true)) - ) - .Concat( - parallelCorpus.TargetCorpora.SelectMany(c => - c.Files.Where(f => f.Format == FileFormat.Paratext).Select(f => (f, c, false)) - ) - ) - .DistinctBy(tuple => tuple.f.Location) + ( + ParallelCorpus parallelCorpus, + MonolingualCorpus monolingualCorpus, + IReadOnlyList files, + _ + ) in corpusBundle.TextCorpora ) { - using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); - IReadOnlyList errors = new ZipParatextProjectVersificationErrorDetector( - zipArchive - ).GetUsfmVersificationErrors(books: GetBooks(monolingualCorpus, isSource)); - if (errors.Count > 0) + foreach (CorpusFile file in files.Where(f => f.Format == FileFormat.Paratext)) { - errorsPerCorpus.Add((monolingualCorpus.Id, errors)); + using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); + IReadOnlyList errors = new ZipParatextProjectVersificationErrorDetector( + zipArchive, + corpusBundle.ParentOf(file.Location)?.Settings + ).GetUsfmVersificationErrors(books: GetBooks(monolingualCorpus)); + if (errors.Count > 0) + { + errorsPerCorpus.Add((parallelCorpus.Id, monolingualCorpus.Id, errors)); + } } } return errorsPerCorpus; } - private static HashSet? GetBooks(MonolingualCorpus corpus, bool isSource) - { - if (!corpus.IsFiltered) - return null; - - List books = []; - if (corpus.TrainOnTextIds != null) - { - books.AddRange(corpus.TrainOnTextIds); - } - else if (corpus.TrainOnChapters != null) - { - books.AddRange(corpus.TrainOnChapters.Keys); - } - - if (isSource) - { - if (corpus.InferenceTextIds != null) - { - books.AddRange(corpus.InferenceTextIds); - } - else if (corpus.InferenceChapters != null) - { - books.AddRange(corpus.InferenceChapters.Keys); - } - } - return [.. books.Select(bookName => Canon.BookIdToNumber(bookName))]; - } - - public QuoteConventionAnalysis? AnalyzeTargetCorpusQuoteConvention(ParallelCorpus parallelCorpus) + public QuoteConventionAnalysis AnalyzeTargetQuoteConvention(CorpusBundle corpusBundle) { - List analyses = []; - foreach (MonolingualCorpus targetMonolingualCorpus in parallelCorpus.TargetCorpora) + Dictionary> analyses = []; + foreach ( + ( + ParallelCorpus parallelCorpus, + MonolingualCorpus targetMonolingualCorpus, + IReadOnlyList corpusFiles, + _ + ) in corpusBundle.TargetTextCorpora + ) { - foreach (CorpusFile file in targetMonolingualCorpus.Files.Where(f => f.Format == FileFormat.Paratext)) + foreach (CorpusFile file in corpusFiles.Where(f => f.Format == FileFormat.Paratext)) { using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); - var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector(zipArchive); + var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector( + zipArchive, + corpusBundle.ParentOf(file.Location)?.Settings + ); Dictionary>? chapters = null; if (targetMonolingualCorpus.TrainOnTextIds is not null) { @@ -91,20 +76,60 @@ ParallelCorpus parallelCorpus kvp => kvp.Value.ToList() ); } + if (!analyses.ContainsKey(parallelCorpus.Id)) + analyses[parallelCorpus.Id] = []; if (chapters != null) - analyses.Add(quoteConventionDetector.GetQuoteConventionAnalysis(chapters)); + analyses[parallelCorpus.Id].Add(quoteConventionDetector.GetQuoteConventionAnalysis(chapters)); else - analyses.Add(quoteConventionDetector.GetQuoteConventionAnalysis()); + analyses[parallelCorpus.Id].Add(quoteConventionDetector.GetQuoteConventionAnalysis()); } } - return QuoteConventionAnalysis.CombineWithWeightedAverage(analyses); + return QuoteConventionAnalysis.CombineWithWeightedAverage( + analyses.Select(kvp => QuoteConventionAnalysis.CombineWithWeightedAverage(kvp.Value)).ToList() + ); + } + + public IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + MissingParentProjectError + )> FindMissingParentProjects(CorpusBundle corpusBundle) + { + List<(string, string, MissingParentProjectError)> errors = []; + foreach ( + ( + ParallelCorpus parallelCorpus, + MonolingualCorpus monolingualCorpus, + IReadOnlyList files, + _ + ) in corpusBundle.TextCorpora + ) + { + foreach (CorpusFile file in files.Where(f => f.Format == FileFormat.Paratext)) + { + using ZipArchive archive = ZipFile.OpenRead(file.Location); + ParatextProjectSettings settings = Machine.Corpora.ZipParatextProjectSettingsParser.Parse(archive); + if (settings.HasParent && corpusBundle.ParentOf(file.Location) == null) + { + errors.Add( + ( + parallelCorpus.Id, + monolingualCorpus.Id, + new() { ProjectName = settings.Name, ParentProjectName = settings.ParentName } + ) + ); + } + } + } + + return errors; } public async Task PreprocessAsync( - IReadOnlyList corpora, + CorpusBundle corpusBundle, Func train, - Func inference, + Func inference, bool useKeyTerms = false, HashSet? ignoreUsfmMarkers = null ) @@ -114,43 +139,18 @@ public async Task PreprocessAsync( bool parallelTrainingDataPresent = false; List keyTermTrainingData = new(); - // Create source and target dictionaries that map from a parallel corpus id - // to an array of all of that parallel corpus' monolingual corpora and associated text corpora - Dictionary sourceCorpora = corpora - .Select(corpus => - ( - CorpusId: corpus.Id, - Corpora: corpus - .SourceCorpora.SelectMany(c => - _textCorpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)) - ) - .ToArray() - ) + // Create source and target arrays of text corpora filtered for training + // based on the filters specified in the associated monolingual corpora + ITextCorpus[] sourceTrainingCorpora = corpusBundle + .SourceTextCorpora.SelectMany(c => + c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) ) - .ToDictionary(tup => tup.CorpusId, tup => tup.Corpora); - - Dictionary targetCorpora = corpora - .Select(corpus => - ( - CorpusId: corpus.Id, - Corpora: corpus - .TargetCorpora.SelectMany(c => - _textCorpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)) - ) - .ToArray() - ) - ) - .ToDictionary(tup => tup.CorpusId, tup => tup.Corpora); - - // Filter the text corpora for training based on the filters specified in the monolingual corpora - ITextCorpus[] sourceTrainingCorpora = sourceCorpora - .Values.SelectMany(sc => sc) - .Select(sc => FilterTrainingCorpora(sc.Corpus, sc.TextCorpus)) .ToArray(); - ITextCorpus[] targetTrainingCorpora = targetCorpora - .Values.SelectMany(tc => tc) - .Select(tc => FilterTrainingCorpora(tc.Corpus, tc.TextCorpus)) + ITextCorpus[] targetTrainingCorpora = corpusBundle + .TargetTextCorpora.SelectMany(c => + c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) + ) .ToArray(); // To support mixed source, collapse multiple source text corpora into one text corpus @@ -190,48 +190,45 @@ public async Task PreprocessAsync( if (useKeyTerms) { // Create a terms corpus for each corpus file - ITextCorpus[]? sourceTermCorpora = _textCorpusService - .CreateTermCorpora( - sourceCorpora.Values.SelectMany(sc => sc).SelectMany(corpus => corpus.Corpus.Files).ToArray() - ) - .ToArray(); - ITextCorpus[]? targetTermCorpora = _textCorpusService - .CreateTermCorpora( - targetCorpora.Values.SelectMany(tc => tc).SelectMany(corpus => corpus.Corpus.Files).ToArray() - ) - .ToArray(); + ITextCorpus[] sourceTermCorpora = corpusBundle.SourceTermCorpora.SelectMany(c => c.TextCorpora).ToArray(); + ITextCorpus[] targetTermCorpora = corpusBundle.TargetTermCorpora.SelectMany(c => c.TextCorpora).ToArray(); + + // As with scripture data, interlace the source rows randomly + // but choose the first non-empty target row, then align + IParallelTextCorpus parallelKeyTermCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); - if (sourceTermCorpora is not null && targetTermCorpora is not null) + // Only train on unique key terms pairs + foreach (ParallelTextRow row in parallelKeyTermCorpus.DistinctBy(row => (row.SourceText, row.TargetText))) { - // As with scripture data, interlace the source rows randomly - // but choose the first non-empty target row, then align - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora - .ChooseRandom(Seed) - .AlignRows(targetTermCorpora.ChooseFirst()); - - // Only train on unique key terms pairs - foreach ( - ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => (row.SourceText, row.TargetText)) - ) - { - keyTermTrainingData.Add( - new Row(row.TextId, row.SourceRefs, row.TargetRefs, row.SourceText, row.TargetText, 1) - ); - } + keyTermTrainingData.Add( + new Row(row.TextId, row.SourceRefs, row.TargetRefs, row.SourceText, row.TargetText, 1) + ); } } // Since we ultimately need to provide inferences for a particular parallel corpus, // we need to preprocess the content on which to inference per parallel corpus - foreach (ParallelCorpus corpus in corpora) + foreach (ParallelCorpus parallelCorpus in corpusBundle.ParallelCorpora) { // Filter the text corpora based on the filters specified in the monolingual corpora - ITextCorpus sourceInferencingCorpus = sourceCorpora[corpus.Id] - .Select(sc => FilterInferencingCorpora(sc.Corpus, sc.TextCorpus, ignoreUsfmMarkers)) + ITextCorpus sourceInferencingCorpus = corpusBundle + .SourceTextCorpora.Where(c => c.ParallelCorpus.Id == parallelCorpus.Id) + .SelectMany(sc => + sc.TextCorpora.Select(textCorpus => + FilterInferencingCorpora(sc.MonolingualCorpus, textCorpus, ignoreUsfmMarkers) + ) + ) .ChooseFirst(); - ITextCorpus targetInferencingCorpus = targetCorpora[corpus.Id] - .Select(tc => FilterInferencingCorpora(tc.Corpus, tc.TextCorpus, ignoreUsfmMarkers)) + ITextCorpus targetInferencingCorpus = corpusBundle + .TargetTextCorpora.Where(c => c.ParallelCorpus.Id == parallelCorpus.Id) + .SelectMany(tc => + tc.TextCorpora.Select(textCorpus => + FilterInferencingCorpora(tc.MonolingualCorpus, textCorpus, ignoreUsfmMarkers) + ) + ) .ChooseFirst(); // We need to align all three of these corpora because we need both the source and target @@ -247,7 +244,7 @@ ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => (row.SourceText, foreach ((Row row, bool isInTrainingData) in CollapseInferencingRanges(inferencingCorpus.ToArray())) { - await inference(row, isInTrainingData, corpus); + await inference(row, isInTrainingData, parallelCorpus.Id); } } @@ -258,7 +255,7 @@ ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => (row.SourceText, { foreach (Row row in keyTermTrainingData) { - await train(row, TrainingDataType.KeyTerms); + await train(row, TrainingDataType.KeyTerm); } } } @@ -467,4 +464,33 @@ private static TextRow CleanSegment(TextRow row) row.Segment = []; return row; } + + private static HashSet? GetBooks(MonolingualCorpus corpus) + { + if (!corpus.IsFiltered) + return null; + + List books = []; + if (corpus.TrainOnTextIds != null) + { + books.AddRange(corpus.TrainOnTextIds); + } + else if (corpus.TrainOnChapters != null) + { + books.AddRange(corpus.TrainOnChapters.Keys); + } + + // if (isSource) + // { + if (corpus.InferenceTextIds != null) + { + books.AddRange(corpus.InferenceTextIds); + } + else if (corpus.InferenceChapters != null) + { + books.AddRange(corpus.InferenceChapters.Keys); + } + // } + return [.. books.Select(bookName => Canon.BookIdToNumber(bookName))]; + } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs deleted file mode 100644 index 4ee909b2a..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs +++ /dev/null @@ -1,51 +0,0 @@ -namespace SIL.ServiceToolkit.Services; - -public class TextCorpusService : ITextCorpusService -{ - public IEnumerable CreateTextCorpora(IReadOnlyList files) - { - List corpora = []; - - List> textFileCorpora = []; - foreach (CorpusFile file in files) - { - switch (file.Format) - { - case FileFormat.Text: - // if there are multiple texts with the same id, then add it to a new corpus or the first - // corpus that doesn't contain a text with that id - Dictionary? corpus = textFileCorpora.FirstOrDefault(c => - !c.ContainsKey(file.TextId) - ); - if (corpus is null) - { - corpus = []; - textFileCorpora.Add(corpus); - } - corpus[file.TextId] = new TextFileText(file.TextId, file.Location); - break; - - case FileFormat.Paratext: - corpora.Add(new ParatextBackupTextCorpus(file.Location, includeAllText: true)); - break; - } - } - foreach (Dictionary corpus in textFileCorpora) - corpora.Add(new DictionaryTextCorpus(corpus.Values)); - - return corpora; - } - - public IEnumerable CreateTermCorpora(IReadOnlyList corpusFiles) - { - foreach (CorpusFile file in corpusFiles) - { - switch (file.Format) - { - case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); - break; - } - } - } -} diff --git a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectFileHandler.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectFileHandler.cs similarity index 97% rename from src/Serval/src/Serval.Shared/Services/ZipParatextProjectFileHandler.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectFileHandler.cs index c0e50d549..79a9cd49d 100644 --- a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectFileHandler.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectFileHandler.cs @@ -1,6 +1,6 @@ using SIL.IO; -namespace Serval.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class ZipParatextProjectFileHandler(IZipContainer container) : IParatextProjectFileHandler { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs new file mode 100644 index 000000000..5d3075d58 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs @@ -0,0 +1,6 @@ +namespace SIL.ServiceToolkit.Services; + +public class ZipParatextProjectSettingsParser( + IZipContainer projectContainer, + ParatextProjectSettings? parentProjectSettings = null +) : ParatextProjectSettingsParserBase(new ZipParatextProjectFileHandler(projectContainer), parentProjectSettings) { } diff --git a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectTextUpdater.cs similarity index 91% rename from src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectTextUpdater.cs index cc83e84dc..506c8dd2a 100644 --- a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectTextUpdater.cs @@ -1,9 +1,9 @@ -namespace Serval.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class ZipParatextProjectTextUpdater(IZipContainer projectContainer, ParatextProjectSettings? settings = null) : ParatextProjectTextUpdaterBase( new ZipParatextProjectFileHandler(projectContainer), - settings ?? new ZipParatextProjectSettingsParser(projectContainer).Parse() + settings ?? new ZipParatextProjectSettingsParser(projectContainer, settings).Parse() ), IDisposable { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs index e4b01cb60..b52b3f447 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs @@ -26,4 +26,5 @@ global using SIL.ServiceToolkit.Configuration; global using SIL.ServiceToolkit.Models; global using SIL.ServiceToolkit.Services; +global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs new file mode 100644 index 000000000..857963a93 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs @@ -0,0 +1,279 @@ +using SIL.ServiceToolkit.Utils; + +namespace SIL.ServiceToolkit.Services; + +public class CorpusBundleTests +{ + [Test] + public void GetSettings() + { + using TestEnvironment env = new(addParatext: true, addText: false); + string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + ParatextProjectSettings? settings = env.CorpusBundle.GetSettings(fileLocation); + Assert.That(settings, Is.Not.Null); + Assert.That(settings.Name, Is.EqualTo("Te1")); + Assert.That(env.CorpusBundle.ParentOf(fileLocation), Is.Null); + } + + [Test] + public void GetSettings_TextFile() + { + using TestEnvironment env = new(addParatext: false, addText: true); + string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + ParatextProjectSettings? settings = env.CorpusBundle.GetSettings(fileLocation); + Assert.That(settings, Is.Null); + Assert.That(env.CorpusBundle.ParentOf(fileLocation), Is.Null); + } + + [Test] + public void GetTextUpdater() + { + using TestEnvironment env = new(addParatext: true, addText: false); + string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + using ZipParatextProjectTextUpdater updater = env.CorpusBundle.GetTextUpdater(fileLocation); + Assert.That( + updater.UpdateUsfm("MAT", [], textBehavior: UpdateUsfmTextBehavior.PreferExisting).ReplaceLineEndings("\n"), + Is.EqualTo( + $@"\id MAT - Test +\h Matthew +\mt Matthew +\ip An introduction to Matthew +\c 1 +\p +\v 1 Source one, chapter one, verse one. +\v 2-3 Source one, chapter one, verse two and three. +\v 4 Source one, chapter one, verse four. +\v 5 Source one, chapter one, verse five. +\v 6 Source one, chapter one, verse six. +\v 7-9 Source one, chapter one, verse seven, eight, and nine. +\v 10 Source one, chapter one, verse ten. +\c 2 +\p +\v 1 Source one, chapter two, verse one. +\v 2 Source one, chapter two, verse two. “a quotation” +\v 3 ... +\v 4 ... +" + ) + .IgnoreLineEndings() + ); + } + + [Test] + public void GetTextUpdater_TextFile() + { + using TestEnvironment env = new(addParatext: false, addText: true); + string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + Assert.Throws(() => env.CorpusBundle.GetTextUpdater(fileLocation)); + } + + [Test] + public void GetTextCorpora() + { + using TestEnvironment env = new(addParatext: true, addText: true); + + Assert.That(env.CorpusBundle.ParallelCorpora.Count(), Is.EqualTo(3)); + + Assert.That(env.CorpusBundle.SourceTermCorpora.Count(c => c.TextCorpora.Any()), Is.EqualTo(2)); + Assert.That( + env.CorpusBundle.SourceTermCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Word) + ); + Assert.That(env.CorpusBundle.TargetTermCorpora.Count(c => c.TextCorpora.Any()), Is.EqualTo(2)); + Assert.That( + env.CorpusBundle.TargetTermCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Word) + ); + + Assert.That(env.CorpusBundle.SourceTextCorpora.SelectMany(c => c.TextCorpora).Count(), Is.EqualTo(4)); + Assert.That( + env.CorpusBundle.SourceTextCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Segment) + ); + Assert.That(env.CorpusBundle.TargetTextCorpora.SelectMany(c => c.TextCorpora).Count(), Is.EqualTo(3)); + Assert.That( + env.CorpusBundle.TargetTextCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Segment) + ); + } + + private class TestEnvironment : DisposableBase + { + public TestEnvironment(bool addParatext, bool addText) + { + CorpusBundle = new CorpusBundle(GetCorpora(addParatext, addText)); + } + + public CorpusBundle CorpusBundle { get; } + + private static readonly string TestDataPath = Path.Combine( + AppContext.BaseDirectory, + "..", + "..", + "..", + "Services", + "data" + ); + private readonly TempDirectory _tempDir = new(name: "CorpusBundleTests"); + + public ParallelCorpus[] GetCorpora(bool addParatext, bool addText) + { + List parallelCorpora = []; + if (addParatext) + { + parallelCorpora.AddRange( + new ParallelCorpus + { + Id = "corpus1", + SourceCorpora = + [ + new MonolingualCorpus + { + Id = "pt-source1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-source1"), + }, + ], + InferenceTextIds = [], + }, + ], + TargetCorpora = + [ + new MonolingualCorpus + { + Id = "pt-target1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-target1"), + }, + ], + }, + ], + }, + new ParallelCorpus + { + Id = "corpus2", + SourceCorpora = + [ + new MonolingualCorpus + { + Id = "pt-source1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-source1"), + }, + ], + TrainOnTextIds = [], + }, + ], + TargetCorpora = + [ + new MonolingualCorpus + { + Id = "pt-target1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-target1"), + }, + ], + TrainOnTextIds = [], + }, + ], + } + ); + } + if (addText) + { + parallelCorpora.AddRange( + new ParallelCorpus + { + Id = "corpus1", + SourceCorpora = + [ + new MonolingualCorpus + { + Id = "source-corpus1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source1.txt"), + }, + ], + }, + new MonolingualCorpus + { + Id = "source-corpus2", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source2.txt"), + }, + ], + }, + ], + TargetCorpora = + [ + new MonolingualCorpus + { + Id = "target-corpus1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "target1.txt"), + }, + ], + }, + ], + } + ); + } + return parallelCorpora.ToArray(); + } + + protected override void DisposeManagedResources() + { + _tempDir.Dispose(); + } + + private string ZipParatextProject(string name) + { + string fileName = Path.Combine(_tempDir.Path, $"{name}.zip"); + if (!File.Exists(fileName)) + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, name), fileName); + return fileName; + } + } +} diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs similarity index 95% rename from src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs rename to src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs index d861a9599..c48b3f008 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs @@ -1,7 +1,9 @@ +using SIL.ServiceToolkit.Utils; + namespace SIL.ServiceToolkit.Services; [TestFixture] -public class ParallelCorpusPreprocessingServiceTests +public class ParallelCorpusServiceTests { [Test] public void TestParallelCorpusAnalysis_FileFormatParatext() @@ -10,8 +12,8 @@ public void TestParallelCorpusAnalysis_FileFormatParatext() ParallelCorpus parallelCorpus = env.GetCorpora(paratextProject: true).First(); const string ExpectedTargetName = "typewriter_english"; - QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetCorpusQuoteConvention( - parallelCorpus + QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention( + new CorpusBundle([parallelCorpus]) ); Assert.Multiple(() => @@ -27,8 +29,8 @@ public void TestParallelCorpusAnalysis_FileFormatText() using var env = new TestEnvironment(); ParallelCorpus parallelCorpus = env.GetCorpora(paratextProject: false).First(); - QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetCorpusQuoteConvention( - parallelCorpus + QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention( + new CorpusBundle([parallelCorpus]) ); Assert.Multiple(() => @@ -46,7 +48,7 @@ public async Task TestParallelCorpusPreprocessor_FileFormatText() int trainCount = 0; int inferenceCount = 0; await env.Processor.PreprocessAsync( - corpora, + new CorpusBundle(corpora), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) @@ -82,7 +84,7 @@ public async Task TestParallelCorpusPreprocessor_FileFormatParatext() var trainRefs = new List(); var inferenceRefs = new List(); await env.Processor.PreprocessAsync( - corpora, + new CorpusBundle(corpora), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) @@ -123,10 +125,9 @@ private class TestEnvironment : DisposableBase "Services", "data" ); - private readonly TempDirectory _tempDir = new TempDirectory(name: "ParallelCorpusProcessingServiceTests"); + private readonly TempDirectory _tempDir = new TempDirectory(name: "ParallelCorpusServiceTests"); - public IParallelCorpusPreprocessingService Processor { get; } = - new ParallelCorpusPreprocessingService(new TextCorpusService()); + public IParallelCorpusService Processor { get; } = new ParallelCorpusService(); public ParallelCorpus[] GetCorpora(bool paratextProject) { diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs index 4fb4a8aed..19d74f19e 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs @@ -6,7 +6,9 @@ global using NSubstitute; global using NSubstitute.ExceptionExtensions; global using NUnit.Framework; +global using NUnit.Framework.Constraints; global using SIL.DataAccess; +global using SIL.Machine.Corpora; global using SIL.Machine.PunctuationAnalysis; global using SIL.Machine.Utils; global using SIL.ObjectModel; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs new file mode 100644 index 000000000..e52803012 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs @@ -0,0 +1,9 @@ +namespace SIL.ServiceToolkit.Utils; + +public static class NUnitExtensions +{ + public static EqualUsingConstraint IgnoreLineEndings(this EqualStringConstraint constraint) + { + return constraint.Using(new IgnoreLineEndingsStringComparer()); + } +} From 8afd47a946e7b97c46a4f584c7747078a8e14c43 Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Thu, 12 Mar 2026 11:10:43 -0400 Subject: [PATCH 2/5] Make CorpusBundle an implementation detail of ParallelCorpusService - add update USFM methods to ParallelCorpusService --- .../EchoEngine/TranslationEngineServiceV1.cs | 2 +- .../WordAlignmentEngineServiceV1.cs | 3 +- .../Services/NmtPreprocessBuildJob.cs | 13 +- .../Services/PreprocessBuildJob.cs | 20 +- .../Services/TranslationPreprocessBuildJob.cs | 8 +- .../WordAlignmentPreprocessBuildJob.cs | 10 +- .../Services/PreprocessBuildJobTests.cs | 2 +- .../Models/Pretranslation.cs | 4 +- .../Services/PretranslationService.cs | 534 ++++-------------- .../SIL.ServiceToolkit/Models/ParallelRow.cs | 11 + .../Services/CorpusBundle.cs | 2 +- .../Services/IParallelCorpusService.cs | 34 +- .../Services/ParallelCorpusService.cs | 290 +++++++++- .../Services/ParallelCorpusServiceTests.cs | 18 +- 14 files changed, 474 insertions(+), 477 deletions(-) create mode 100644 src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs diff --git a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs index 24998c3ec..6ed6178b8 100644 --- a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs @@ -125,7 +125,7 @@ await client.BuildStartedAsync( List pretranslationsRequests = []; await _parallelCorpusPreprocessingService.PreprocessAsync( - new SIL.ServiceToolkit.Utils.CorpusBundle(request.Corpora.Select(Map)), + request.Corpora.Select(Map), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) diff --git a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs index df88dd181..12c3e9d12 100644 --- a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs @@ -1,5 +1,4 @@ using Serval.WordAlignment.V1; -using SIL.ServiceToolkit.Utils; namespace EchoEngine; @@ -80,7 +79,7 @@ await client.BuildStartedAsync( int wordAlignCount = 0; List wordAlignmentsRequests = []; await _parallelCorpusPreprocessingService.PreprocessAsync( - new CorpusBundle(request.Corpora.Select(Map)), + request.Corpora.Select(Map), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index 20e95f3bc..f4ff9487b 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -33,12 +33,13 @@ private bool ResolveLanguageCode(string languageCode, out string resolvedCode) protected override async Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { string overallTargetQuoteConventionAnalysis = - ParallelCorpusService.AnalyzeTargetQuoteConvention(corpusBundle)?.BestQuoteConvention?.Name ?? string.Empty; + ParallelCorpusService.AnalyzeTargetQuoteConvention(parallelCorpora)?.BestQuoteConvention?.Name + ?? string.Empty; await PlatformService.UpdateTargetQuoteConventionAsync( engineId, @@ -55,7 +56,7 @@ protected override async Task UpdateBuildExecutionData( int pretranslateCount, string sourceLanguageTag, string targetLanguageTag, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { @@ -74,7 +75,7 @@ CancellationToken cancellationToken pretranslateCount, sourceLanguageTag, targetLanguageTag, - corpusBundle + parallelCorpora ); int maxWarnings = BuildJobOptions.MaxWarnings; @@ -118,12 +119,12 @@ protected override IReadOnlyList GetWarnings( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - CorpusBundle corpusBundle + IReadOnlyList parallelCorpora ) { List warnings = [ - .. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, corpusBundle), + .. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, parallelCorpora), ]; // Has at least a Gospel of Mark amount of data and not the special case of no data which will be caught elsewhere diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 5e82c1d35..d3488e8d3 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -45,11 +45,9 @@ CancellationToken cancellationToken if (engine is null) throw new OperationCanceledException($"Engine {engineId} does not exist. Build canceled."); - CorpusBundle corpusBundle = new(data); - (int trainCount, int inferenceCount) = await WriteDataFilesAsync( buildId, - corpusBundle, + data, buildOptions, cancellationToken ); @@ -61,11 +59,11 @@ await UpdateBuildExecutionData( inferenceCount, engine.SourceLanguage, engine.TargetLanguage, - corpusBundle, + data, cancellationToken ); - await UpdateTargetQuoteConventionAsync(engineId, buildId, corpusBundle, cancellationToken); + await UpdateTargetQuoteConventionAsync(engineId, buildId, data, cancellationToken); if (inferenceCount == 0 && engine is TranslationEngine { IsModelPersisted: false }) { @@ -96,20 +94,20 @@ protected abstract Task UpdateBuildExecutionData( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ); protected virtual Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) => Task.CompletedTask; protected abstract Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, string? buildOptions, CancellationToken cancellationToken ); @@ -139,7 +137,7 @@ protected virtual IReadOnlyList GetWarnings( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - CorpusBundle corpusBundle + IReadOnlyList parallelCorpora ) { List warnings = []; @@ -149,7 +147,7 @@ CorpusBundle corpusBundle string parallelCorpusId, string monolingualCorpusId, IReadOnlyList errors - ) in ParallelCorpusService.AnalyzeUsfmVersification(corpusBundle) + ) in ParallelCorpusService.AnalyzeUsfmVersification(parallelCorpora) ) { foreach (UsfmVersificationError error in errors) @@ -173,7 +171,7 @@ IReadOnlyList errors string parallelCorpusId, string monolingualCorpusId, MissingParentProjectError error - ) in ParallelCorpusService.FindMissingParentProjects(corpusBundle) + ) in ParallelCorpusService.FindMissingParentProjects(parallelCorpora) ) { warnings.Add( diff --git a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs index 7870b3703..47e07c1cd 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs @@ -23,7 +23,7 @@ IOptionsMonitor options { protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, string? buildOptions, CancellationToken cancellationToken ) @@ -56,7 +56,7 @@ await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.tx int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); await ParallelCorpusService.PreprocessAsync( - corpusBundle, + parallelCorpora, async (row, trainingDataType) => { if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0) @@ -109,7 +109,7 @@ protected override async Task UpdateBuildExecutionData( int pretranslateCount, string sourceLanguageTag, string targetLanguageTag, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { @@ -118,7 +118,7 @@ CancellationToken cancellationToken pretranslateCount, sourceLanguageTag, targetLanguageTag, - corpusBundle + parallelCorpora ); // Log summary of build data diff --git a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs index 0af94cd63..2eb3c06c9 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs @@ -23,7 +23,7 @@ IOptionsMonitor options { protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, string? buildOptions, CancellationToken cancellationToken ) @@ -56,7 +56,7 @@ await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.tx int inferenceCount = 0; wordAlignmentWriter.WriteStartArray(); await ParallelCorpusService.PreprocessAsync( - corpusBundle, + parallelCorpora, async (row, trainingDataType) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) @@ -109,7 +109,7 @@ protected override async Task UpdateBuildExecutionData( int wordAlignCount, string sourceLanguageTag, string targetLanguageTag, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { @@ -118,7 +118,7 @@ CancellationToken cancellationToken wordAlignCount, sourceLanguageTag, targetLanguageTag, - corpusBundle + parallelCorpora ); // Log summary of build data @@ -148,7 +148,7 @@ CancellationToken cancellationToken protected override Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - CorpusBundle corpusBundle, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index afcb87275..1417d318d 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -347,7 +347,7 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() env.ParallelCorpusService = Substitute.For(); env.ParallelCorpusService.When(s => s.PreprocessAsync( - Arg.Any(), + Arg.Any>(), Arg.Any>(), Arg.Any>(), Arg.Any(), diff --git a/src/Serval/src/Serval.Translation/Models/Pretranslation.cs b/src/Serval/src/Serval.Translation/Models/Pretranslation.cs index 854203203..74f76ad2f 100644 --- a/src/Serval/src/Serval.Translation/Models/Pretranslation.cs +++ b/src/Serval/src/Serval.Translation/Models/Pretranslation.cs @@ -12,7 +12,7 @@ public class Pretranslation : IEntity public required IReadOnlyList? TargetRefs { get; init; } = []; public required IReadOnlyList Refs { get; init; } public required string Translation { get; init; } - public IEnumerable? SourceTokens { get; init; } - public IEnumerable? TranslationTokens { get; init; } + public IReadOnlyList? SourceTokens { get; init; } + public IReadOnlyList? TranslationTokens { get; init; } public IReadOnlyList? Alignment { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 9340388b9..311085b7a 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -1,6 +1,4 @@ using SIL.Machine.Corpora; -using SIL.Machine.PunctuationAnalysis; -using SIL.Machine.Translation; namespace Serval.Translation.Services; @@ -8,11 +6,13 @@ public class PretranslationService( IRepository pretranslations, IRepository engines, IRepository builds, - IOptionsMonitor dataFileOptions + IOptionsMonitor dataFileOptions, + IParallelCorpusService parallelCorpusService ) : EntityServiceBase(pretranslations), IPretranslationService { private readonly IRepository _engines = engines; private readonly IRepository _builds = builds; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; private const string AIDisclaimerRemark = "This draft of {0} was generated using AI on {1}. It should be reviewed and edited carefully."; private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; @@ -52,77 +52,12 @@ public async Task GetUsfmAsync( Engine? engine = await _engines.GetAsync(engineId, cancellationToken); Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId); ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); - Build? build = (await _builds.GetAllAsync(b => b.EngineRef == engineId, cancellationToken)) - .OrderByDescending(b => b.DateFinished) - .FirstOrDefault(); - if (build is null || build.DateFinished is null) - throw new InvalidOperationException($"Could not find any completed builds for engine '{engineId}'."); - - CorpusBundle corpusBundle; - if (build.TrainOn == null || build.Pretranslate == null) - { - if (parallelCorpus != null) - { - corpusBundle = new CorpusBundle(engine!.ParallelCorpora.Select(Map)); - } - else - { - corpusBundle = new CorpusBundle(engine!.Corpora.Select(c => Map(c, engine))); - } - } - else - { - HashSet referencedCorpora; - if (parallelCorpus != null) - { - referencedCorpora = build - .TrainOn.Select(t => t.ParallelCorpusRef) - .Concat(build.Pretranslate.Select(p => p.ParallelCorpusRef)) - .Where(r => r != null) - .Select(r => r!) - .ToHashSet(); - corpusBundle = new CorpusBundle( - engine!.ParallelCorpora.Where(pc => referencedCorpora.Contains(pc.Id)).Select(Map) - ); - } - else - { - referencedCorpora = build - .TrainOn.Select(t => t.CorpusRef) - .Concat(build.Pretranslate.Select(p => p.CorpusRef)) - .Where(r => r != null) - .Select(r => r!) - .ToHashSet(); - corpusBundle = new CorpusBundle( - engine!.Corpora.Where(c => referencedCorpora.Contains(c.Id)).Select(c => Map(c, engine)) - ); - } - } - - string disclaimerRemark = string.Format( - CultureInfo.InvariantCulture, - AIDisclaimerRemark, - textId, - build.DateFinished.Value.ToUniversalTime().ToString("u") - ); - string markerPlacementRemark = GenerateMarkerPlacementRemark( - paragraphMarkerBehavior, - embedBehavior, - styleMarkerBehavior - ); - - List remarks = [disclaimerRemark, markerPlacementRemark]; - - CorpusFile sourceFile; - CorpusFile targetFile; if (corpus is not null) { if (corpus.SourceFiles.Count == 0) throw new InvalidOperationException($"The corpus {corpus.Id} has no source files."); - sourceFile = corpus.SourceFiles[0]; if (corpus.TargetFiles.Count == 0) throw new InvalidOperationException($"The corpus {corpus.Id} has no target files."); - targetFile = corpus.TargetFiles[0]; } else if (parallelCorpus is not null) { @@ -136,7 +71,6 @@ public async Task GetUsfmAsync( $"The corpus {parallelCorpus.SourceCorpora[0].Id} referenced in parallel corpus {parallelCorpus.Id} has no files associated with it." ); } - sourceFile = parallelCorpus.SourceCorpora[0].Files[0]; if (parallelCorpus.TargetCorpora.Count == 0) { throw new InvalidOperationException($"The parallel corpus {parallelCorpus.Id} has no target corpora."); @@ -147,132 +81,118 @@ public async Task GetUsfmAsync( $"The corpus {parallelCorpus.TargetCorpora[0].Id} referenced in parallel corpus {parallelCorpus.Id} has no files associated with it." ); } - targetFile = parallelCorpus.TargetCorpora[0].Files[0]; } else { throw new EntityNotFoundException($"Could not find the corpus '{corpusId}' in engine '{engineId}'."); } - if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext) - throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora."); - ParatextProjectSettings sourceSettings = corpusBundle.GetSettings(GetFilePath(sourceFile.Filename))!; - ParatextProjectSettings targetSettings = corpusBundle.GetSettings(GetFilePath(targetFile.Filename))!; + Build? build = (await _builds.GetAllAsync(b => b.EngineRef == engineId, cancellationToken)) + .OrderByDescending(b => b.DateFinished) + .FirstOrDefault(); + if (build is null || build.DateFinished is null) + throw new InvalidOperationException($"Could not find any completed builds for engine '{engineId}'."); - IEnumerable pretranslations = await GetAllAsync( - engineId, - modelRevision, - corpusId, + string disclaimerRemark = string.Format( + CultureInfo.InvariantCulture, + AIDisclaimerRemark, textId, - cancellationToken + build.DateFinished.Value.ToUniversalTime().ToString("u") + ); + string markerPlacementRemark = GenerateMarkerPlacementRemark( + paragraphMarkerBehavior, + embedBehavior, + styleMarkerBehavior ); - IEnumerable<( - IReadOnlyList SourceScriptureRefs, - IReadOnlyList TargetScriptureRefs, - Pretranslation Pretranslation, - PretranslationUsfmMarkerBehavior ParagraphBehavior, - PretranslationUsfmMarkerBehavior StyleBehavior - )> pretranslationRows = pretranslations - .Select(p => Map(p, sourceSettings, targetSettings, paragraphMarkerBehavior, styleMarkerBehavior)) - .Where(p => p.TargetScriptureRefs.Any()) - .OrderBy(p => p.TargetScriptureRefs[0]); + List remarks = [disclaimerRemark, markerPlacementRemark]; - List updateBlockHandlers = []; - if ( - paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition - && template == PretranslationUsfmTemplate.Source - ) + SIL.ServiceToolkit.Models.ParallelCorpus[] parallelCorpora; + if (build.TrainOn == null || build.Pretranslate == null) { - updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler()); + if (parallelCorpus != null) + { + parallelCorpora = engine!.ParallelCorpora.Select(Map).ToArray(); + } + else + { + parallelCorpora = engine!.Corpora.Select(c => Map(c, engine)).ToArray(); + } + } + else + { + HashSet referencedCorpora; + if (parallelCorpus != null) + { + referencedCorpora = build + .TrainOn.Select(t => t.ParallelCorpusRef) + .Concat(build.Pretranslate.Select(p => p.ParallelCorpusRef)) + .Where(r => r != null) + .Select(r => r!) + .ToHashSet(); + parallelCorpora = engine! + .ParallelCorpora.Where(pc => referencedCorpora.Contains(pc.Id)) + .Select(Map) + .ToArray(); + } + else + { + referencedCorpora = build + .TrainOn.Select(t => t.CorpusRef) + .Concat(build.Pretranslate.Select(p => p.CorpusRef)) + .Where(r => r != null) + .Select(r => r!) + .ToHashSet(); + parallelCorpora = engine! + .Corpora.Where(c => referencedCorpora.Contains(c.Id)) + .Select(c => Map(c, engine)) + .ToArray(); + } } + IEnumerable pretranslations = ( + await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) + ).Select(p => new SIL.ServiceToolkit.Models.ParallelRow + { + SourceRefs = p.SourceRefs ?? [], + TargetRefs = p.TargetRefs ?? [], + TargetText = p.Translation, + Alignment = p + .Alignment?.Select(wp => new SIL.Machine.Corpora.AlignedWordPair(wp.SourceIndex, wp.TargetIndex)) + .ToArray(), + SourceTokens = p.SourceTokens, + TargetTokens = p.TranslationTokens, + }); + + string? targetQuoteConvention = null; + if (quoteNormalizationBehavior == PretranslationNormalizationBehavior.Denormalized) + targetQuoteConvention = build.TargetQuoteConvention; + string usfm = ""; // Update the target book if it exists if (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Target) { - // the pretranslations are generated from the source book and inserted into the target book - // use relaxed references since the USFM structure may not be the same - pretranslationRows = pretranslationRows.Select(p => - ( - // we won't use the source refs - (IReadOnlyList)[], - (IReadOnlyList)p.TargetScriptureRefs.Select(r => r.ToRelaxed()).ToArray(), - p.Pretranslation, - p.ParagraphBehavior, - p.StyleBehavior - ) - ); - using SIL.ServiceToolkit.Services.ZipParatextProjectTextUpdater updater = corpusBundle.GetTextUpdater( - GetFilePath(targetFile.Filename) - ); - switch (textOrigin) + UpdateUsfmTextBehavior textBehavior = textOrigin switch { - case PretranslationUsfmTextOrigin.PreferExisting: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: false)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.PreferExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.PreferPretranslated: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: false)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.PreferNew, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.OnlyExisting: - usfm = - updater.UpdateUsfm( - textId, - [], // don't put any pretranslations, we only want the existing text. - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.PreferNew, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.OnlyPretranslated: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: false)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.StripExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - } + PretranslationUsfmTextOrigin.PreferExisting => UpdateUsfmTextBehavior.PreferExisting, + PretranslationUsfmTextOrigin.PreferPretranslated => UpdateUsfmTextBehavior.PreferNew, + PretranslationUsfmTextOrigin.OnlyExisting => UpdateUsfmTextBehavior.PreferNew, + PretranslationUsfmTextOrigin.OnlyPretranslated => UpdateUsfmTextBehavior.StripExisting, + _ => throw new InvalidEnumArgumentException(nameof(textOrigin)), + }; + + usfm = _parallelCorpusService.UpdateTargetUsfm( + parallelCorpora, + corpusId, + textId, + pretranslations.ToArray(), + textBehavior, + Map(paragraphMarkerBehavior), + Map(embedBehavior), + Map(styleMarkerBehavior), + remarks, + targetQuoteConvention + ); } if ( @@ -280,207 +200,24 @@ PretranslationUsfmMarkerBehavior StyleBehavior && (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Source) ) { - using SIL.ServiceToolkit.Services.ZipParatextProjectTextUpdater updater = corpusBundle.GetTextUpdater( - GetFilePath(sourceFile.Filename) - ); - // Copy and update the source book if it exists - switch (textOrigin) - { - case PretranslationUsfmTextOrigin.PreferExisting: - case PretranslationUsfmTextOrigin.PreferPretranslated: - case PretranslationUsfmTextOrigin.OnlyPretranslated: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: true)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.StripExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: true - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.OnlyExisting: - usfm = - updater.UpdateUsfm( - textId, - [], // don't pass the pretranslations, we only want the existing text. - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.StripExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: true - ) ?? ""; - break; - } - } - if ( - quoteNormalizationBehavior == PretranslationNormalizationBehavior.Denormalized - && !string.IsNullOrEmpty(build.TargetQuoteConvention) - ) - { - usfm = DenormalizeQuotationMarks(usfm, build.TargetQuoteConvention); - } - - return usfm; - } - - private static ( - IReadOnlyList SourceScriptureRefs, - IReadOnlyList TargetScriptureRefs, - Pretranslation Pretranslation, - PretranslationUsfmMarkerBehavior ParagraphMarkerBehavior, - PretranslationUsfmMarkerBehavior StyleMarkerBehavior - ) Map( - Pretranslation pretranslation, - ParatextProjectSettings sourceSettings, - ParatextProjectSettings targetSettings, - PretranslationUsfmMarkerBehavior paragraphMarkerBehavior, - PretranslationUsfmMarkerBehavior styleMarkerBehavior - ) - { - IReadOnlyList sourceScriptureRefs, - targetScriptureRefs; - if (pretranslation.TargetRefs?.Any() ?? false) - { - sourceScriptureRefs = - pretranslation - .SourceRefs?.Select(r => - { - bool parsed = ScriptureRef.TryParse(r, sourceSettings.Versification, out ScriptureRef sr); - return new { Parsed = parsed, ScriptureRef = sr }; - }) - .Where(r => r.Parsed) - .Select(r => r.ScriptureRef) - .ToArray() - ?? []; - targetScriptureRefs = pretranslation - .TargetRefs.Select(r => - { - bool parsed = ScriptureRef.TryParse(r, targetSettings.Versification, out ScriptureRef sr); - return new { Parsed = parsed, ScriptureRef = sr }; - }) - .Where(r => r.Parsed) - .Select(r => r.ScriptureRef) - .ToArray(); - } - else - { - sourceScriptureRefs = []; - targetScriptureRefs = pretranslation - .Refs.Select(r => - { - bool parsed = ScriptureRef.TryParse(r, targetSettings.Versification, out ScriptureRef sr); - return new { Parsed = parsed, ScriptureRef = sr }; - }) - .Where(r => r.Parsed) - .Select(r => r.ScriptureRef) - .ToArray(); - } - - return (sourceScriptureRefs, targetScriptureRefs, pretranslation, paragraphMarkerBehavior, styleMarkerBehavior); - } - - private static string DenormalizeQuotationMarks(string usfm, string quoteConvention) - { - QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(quoteConvention); - if (targetQuoteConvention is null) - return usfm; - - QuotationMarkDenormalizationFirstPass quotationMarkDenormalizationFirstPass = new(targetQuoteConvention); - - UsfmParser.Parse(usfm, quotationMarkDenormalizationFirstPass); - List<(int ChapterNumber, QuotationMarkUpdateStrategy Strategy)> bestChapterStrategies = - quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); - - QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationMarkDenormalizer = new( - targetQuoteConvention, - new QuotationMarkUpdateSettings( - chapterStrategies: bestChapterStrategies.Select(tuple => tuple.Strategy).ToList() - ) - ); - int denormalizableChapterCount = bestChapterStrategies.Count(tup => - tup.Strategy != QuotationMarkUpdateStrategy.Skip - ); - List remarks = []; - string quotationDenormalizationRemark; - if (denormalizableChapterCount == bestChapterStrategies.Count) - { - quotationDenormalizationRemark = - "The quote style in all chapters has been automatically adjusted to match the rest of the project."; - } - else if (denormalizableChapterCount > 0) - { - quotationDenormalizationRemark = - "The quote style in the following chapters has been automatically adjusted to match the rest of the project: " - + GetChapterRangesString( - bestChapterStrategies - .Where(tuple => tuple.Strategy != QuotationMarkUpdateStrategy.Skip) - .Select(tuple => tuple.ChapterNumber) - .ToList() - ) - + "."; - } - else - { - quotationDenormalizationRemark = - "The quote style was not automatically adjusted to match the rest of your project in any chapters."; + usfm = _parallelCorpusService.UpdateSourceUsfm( + parallelCorpora, + corpusId, + textId, + textOrigin == PretranslationUsfmTextOrigin.OnlyExisting ? [] : pretranslations.ToArray(), + Map(paragraphMarkerBehavior), + Map(embedBehavior), + Map(styleMarkerBehavior), + placeParagraphMarkers: paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition, + remarks, + targetQuoteConvention + ); } - remarks.Add(quotationDenormalizationRemark); - - var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer], remarks: remarks); - UsfmParser.Parse(usfm, updater); - usfm = updater.GetUsfm(); return usfm; } - public static string GetChapterRangesString(List chapterNumbers) - { - chapterNumbers = chapterNumbers.Order().ToList(); - int start = chapterNumbers[0]; - int end = chapterNumbers[0]; - List chapterRangeStrings = []; - foreach (int chapterNumber in chapterNumbers[1..]) - { - if (chapterNumber == end + 1) - { - end = chapterNumber; - } - else - { - if (start == end) - { - chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); - } - else - { - chapterRangeStrings.Add($"{start}-{end}"); - } - start = chapterNumber; - end = chapterNumber; - } - } - if (start == end) - { - chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); - } - else - { - chapterRangeStrings.Add($"{start}-{end}"); - } - return string.Join(", ", chapterRangeStrings); - } - /// /// Generate a natural sounding remark/comment describing marker placement. /// @@ -550,61 +287,6 @@ private static UpdateUsfmMarkerBehavior Map(PretranslationUsfmMarkerBehavior beh }; } - private static WordAlignmentMatrix Map(IEnumerable? alignedWordPairs) - { - int rowCount = 0; - int columnCount = 0; - if (alignedWordPairs is not null) - { - foreach (Models.AlignedWordPair pair in alignedWordPairs) - { - if (pair.SourceIndex + 1 > rowCount) - rowCount = pair.SourceIndex + 1; - if (pair.TargetIndex + 1 > columnCount) - columnCount = pair.TargetIndex + 1; - } - } - return new WordAlignmentMatrix( - rowCount, - columnCount, - alignedWordPairs?.Select(wp => (wp.SourceIndex, wp.TargetIndex)) - ); - } - - private static UpdateUsfmRow Map( - ( - IReadOnlyList SourceScriptureRefs, - IReadOnlyList TargetScriptureRefs, - Pretranslation Pretranslation, - PretranslationUsfmMarkerBehavior ParagraphBehavior, - PretranslationUsfmMarkerBehavior StyleBehavior - ) pretranslationRow, - bool isSource - ) - { - return new UpdateUsfmRow( - isSource && pretranslationRow.SourceScriptureRefs.Any() - ? pretranslationRow.SourceScriptureRefs - : pretranslationRow.TargetScriptureRefs, - pretranslationRow.Pretranslation.Translation, - pretranslationRow.Pretranslation.Alignment is not null - ? new Dictionary - { - { - PlaceMarkersAlignmentInfo.MetadataKey, - new PlaceMarkersAlignmentInfo( - pretranslationRow.Pretranslation.SourceTokens?.ToList() ?? [], - pretranslationRow.Pretranslation.TranslationTokens?.ToList() ?? [], - Map(pretranslationRow.Pretranslation.Alignment), - paragraphBehavior: Map(pretranslationRow.ParagraphBehavior), - styleBehavior: Map(pretranslationRow.StyleBehavior) - ) - }, - } - : null - ); - } - private SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) { return new SIL.ServiceToolkit.Models.ParallelCorpus diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs new file mode 100644 index 000000000..13a40e319 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs @@ -0,0 +1,11 @@ +namespace SIL.ServiceToolkit.Models; + +public record ParallelRow +{ + public required IReadOnlyList SourceRefs { get; init; } + public required IReadOnlyList TargetRefs { get; init; } + public required string TargetText { get; init; } + public required IReadOnlyList? SourceTokens { get; init; } + public required IReadOnlyList? TargetTokens { get; init; } + public IReadOnlyList? Alignment { get; init; } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs index ea0d3f0e4..09720985e 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs @@ -43,7 +43,7 @@ IReadOnlyList TextCorpora IReadOnlyList CorpusFile, IReadOnlyList TextCorpora )> TargetTermCorpora { get; } - public IEnumerable ParallelCorpora { get; } + public IReadOnlyList ParallelCorpora { get; } public CorpusBundle(IEnumerable parallelCorpora) { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs index bf3d7aef3..ae35ebbbb 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs @@ -2,25 +2,51 @@ namespace SIL.ServiceToolkit.Services; public interface IParallelCorpusService { - QuoteConventionAnalysis AnalyzeTargetQuoteConvention(CorpusBundle corpusBundle); + QuoteConventionAnalysis AnalyzeTargetQuoteConvention(IEnumerable parallelCorpora); IReadOnlyList<( string ParallelCorpusId, string MonolingualCorpusId, IReadOnlyList Errors - )> AnalyzeUsfmVersification(CorpusBundle corpusBundle); + )> AnalyzeUsfmVersification(IEnumerable parallelCorpora); IReadOnlyList<( string ParallelCorpusId, string MonolingualCorpusId, MissingParentProjectError - )> FindMissingParentProjects(CorpusBundle corpusBundle); + )> FindMissingParentProjects(IEnumerable parallelCorpora); Task PreprocessAsync( - CorpusBundle corpusBundle, + IEnumerable parallelCorpora, Func train, Func inference, bool useKeyTerms = false, HashSet? ignoreUsfmMarkers = null ); + + public string UpdateSourceUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + bool placeParagraphMarkers, + IEnumerable? remarks, + string? targetQuoteConvention + ); + + public string UpdateTargetUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmTextBehavior textBehavior, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + IEnumerable? remarks, + string? targetQuoteConvention + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs index c69ece379..3b8c4982a 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs @@ -1,3 +1,5 @@ +using System.Globalization; +using SIL.Machine.Translation; using SIL.Scripture; namespace SIL.ServiceToolkit.Services; @@ -10,8 +12,9 @@ public class ParallelCorpusService : IParallelCorpusService string ParallelCorpusId, string MonolingualCorpusId, IReadOnlyList Errors - )> AnalyzeUsfmVersification(CorpusBundle corpusBundle) + )> AnalyzeUsfmVersification(IEnumerable parallelCorpora) { + CorpusBundle corpusBundle = new(parallelCorpora); List<( string ParallelCorpusId, string MonolingualCorpusId, @@ -42,8 +45,9 @@ IReadOnlyList Errors return errorsPerCorpus; } - public QuoteConventionAnalysis AnalyzeTargetQuoteConvention(CorpusBundle corpusBundle) + public QuoteConventionAnalysis AnalyzeTargetQuoteConvention(IEnumerable parallelCorpora) { + CorpusBundle corpusBundle = new(parallelCorpora); Dictionary> analyses = []; foreach ( ( @@ -94,8 +98,9 @@ public QuoteConventionAnalysis AnalyzeTargetQuoteConvention(CorpusBundle corpusB string ParallelCorpusId, string MonolingualCorpusId, MissingParentProjectError - )> FindMissingParentProjects(CorpusBundle corpusBundle) + )> FindMissingParentProjects(IEnumerable parallelCorpora) { + CorpusBundle corpusBundle = new(parallelCorpora); List<(string, string, MissingParentProjectError)> errors = []; foreach ( ( @@ -127,13 +132,14 @@ public QuoteConventionAnalysis AnalyzeTargetQuoteConvention(CorpusBundle corpusB } public async Task PreprocessAsync( - CorpusBundle corpusBundle, + IEnumerable parallelCorpora, Func train, Func inference, bool useKeyTerms = false, HashSet? ignoreUsfmMarkers = null ) { + CorpusBundle corpusBundle = new(parallelCorpora); ignoreUsfmMarkers ??= []; bool parallelTrainingDataPresent = false; @@ -493,4 +499,280 @@ private static TextRow CleanSegment(TextRow row) // } return [.. books.Select(bookName => Canon.BookIdToNumber(bookName))]; } + + public string UpdateSourceUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + bool placeParagraphMarkers, + IEnumerable? remarks, + string? targetQuoteConvention + ) + { + return UpdateUsfm( + parallelCorpora, + corpusId, + bookId, + rows, + UpdateUsfmTextBehavior.StripExisting, + paragraphBehavior, + embedBehavior, + styleBehavior, + placeParagraphMarkers ? [new PlaceMarkersUsfmUpdateBlockHandler()] : null, + remarks, + targetQuoteConvention, + isSource: true + ); + } + + public string UpdateTargetUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmTextBehavior textBehavior, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + IEnumerable? remarks, + string? targetQuoteConvention + ) + { + return UpdateUsfm( + parallelCorpora, + corpusId, + bookId, + rows, + textBehavior, + paragraphBehavior, + embedBehavior, + styleBehavior, + updateBlockHandlers: null, + remarks, + targetQuoteConvention, + isSource: false + ); + } + + private static string UpdateUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IEnumerable rows, + UpdateUsfmTextBehavior textBehavior, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + IEnumerable? updateBlockHandlers, + IEnumerable? remarks, + string? targetQuoteConvention, + bool isSource + ) + { + CorpusBundle corpusBundle = new(parallelCorpora); + ParallelCorpus corpus = corpusBundle.ParallelCorpora.Single(c => c.Id == corpusId); + CorpusFile sourceFile = corpus.SourceCorpora[0].Files[0]; + CorpusFile targetFile = corpus.TargetCorpora[0].Files[0]; + ParatextProjectSettings? sourceSettings = corpusBundle.GetSettings(sourceFile.Location); + ParatextProjectSettings? targetSettings = corpusBundle.GetSettings(targetFile.Location); + + using ZipParatextProjectTextUpdater updater = corpusBundle.GetTextUpdater( + isSource ? sourceFile.Location : targetFile.Location + ); + string usfm = + updater.UpdateUsfm( + bookId, + rows.Select(p => + Map( + p, + isSource, + sourceSettings?.Versification, + targetSettings?.Versification, + paragraphBehavior, + styleBehavior + ) + ) + .Where(row => row.Refs.Any()) + .OrderBy(row => row.Refs[0]) + .ToArray(), + sourceSettings?.FullName, + textBehavior, + paragraphBehavior, + embedBehavior, + styleBehavior, + updateBlockHandlers: updateBlockHandlers, + remarks: remarks, + errorHandler: (_) => true, + compareSegments: isSource + ) ?? ""; + + if (!string.IsNullOrEmpty(targetQuoteConvention)) + usfm = DenormalizeQuotationMarks(usfm, targetQuoteConvention); + return usfm; + } + + private static UpdateUsfmRow Map( + ParallelRow row, + bool isSource, + ScrVers? sourceVersification, + ScrVers? targetVersification, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior styleBehavior + ) + { + Dictionary? metadata = null; + if (row.Alignment is not null) + { + metadata = new Dictionary + { + { + PlaceMarkersAlignmentInfo.MetadataKey, + new PlaceMarkersAlignmentInfo( + row.SourceTokens, + row.TargetTokens, + CreateWordAlignmentMatrix(row), + paragraphBehavior, + styleBehavior + ) + }, + }; + } + + ScriptureRef[] refs; + if (isSource) + { + refs = ( + row.SourceRefs.Any() + ? Map(row.SourceRefs, sourceVersification) + : Map(row.TargetRefs, targetVersification) + ).ToArray(); + } + else + { + // the pretranslations are generated from the source book and inserted into the target book + // use relaxed references since the USFM structure may not be the same + refs = Map(row.TargetRefs, targetVersification).Select(r => r.ToRelaxed()).ToArray(); + } + + return new UpdateUsfmRow(refs, row.TargetText, metadata); + } + + private static IEnumerable Map(IEnumerable refs, ScrVers? versification) + { + return refs.Select(r => + { + ScriptureRef.TryParse(r, versification, out ScriptureRef sr); + return sr; + }) + .Where(r => !r.IsEmpty); + } + + private static WordAlignmentMatrix? CreateWordAlignmentMatrix(ParallelRow row) + { + if (row.Alignment is null || row.SourceTokens is null || row.TargetTokens is null) + { + return null; + } + + var matrix = new WordAlignmentMatrix(row.SourceTokens.Count, row.TargetTokens.Count); + foreach (AlignedWordPair wordPair in row.Alignment) + matrix[wordPair.SourceIndex, wordPair.TargetIndex] = true; + + return matrix; + } + + private static string DenormalizeQuotationMarks(string usfm, string quoteConvention) + { + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(quoteConvention); + if (targetQuoteConvention is null) + return usfm; + + QuotationMarkDenormalizationFirstPass quotationMarkDenormalizationFirstPass = new(targetQuoteConvention); + + UsfmParser.Parse(usfm, quotationMarkDenormalizationFirstPass); + List<(int ChapterNumber, QuotationMarkUpdateStrategy Strategy)> bestChapterStrategies = + quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); + + QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationMarkDenormalizer = new( + targetQuoteConvention, + new QuotationMarkUpdateSettings( + chapterStrategies: bestChapterStrategies.Select(tuple => tuple.Strategy).ToList() + ) + ); + int denormalizableChapterCount = bestChapterStrategies.Count(tup => + tup.Strategy != QuotationMarkUpdateStrategy.Skip + ); + List remarks = []; + string quotationDenormalizationRemark; + if (denormalizableChapterCount == bestChapterStrategies.Count) + { + quotationDenormalizationRemark = + "The quote style in all chapters has been automatically adjusted to match the rest of the project."; + } + else if (denormalizableChapterCount > 0) + { + quotationDenormalizationRemark = + "The quote style in the following chapters has been automatically adjusted to match the rest of the project: " + + GetChapterRangesString( + bestChapterStrategies + .Where(tuple => tuple.Strategy != QuotationMarkUpdateStrategy.Skip) + .Select(tuple => tuple.ChapterNumber) + .ToList() + ) + + "."; + } + else + { + quotationDenormalizationRemark = + "The quote style was not automatically adjusted to match the rest of your project in any chapters."; + } + remarks.Add(quotationDenormalizationRemark); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer], remarks: remarks); + UsfmParser.Parse(usfm, updater); + + usfm = updater.GetUsfm(); + return usfm; + } + + public static string GetChapterRangesString(List chapterNumbers) + { + chapterNumbers = chapterNumbers.Order().ToList(); + int start = chapterNumbers[0]; + int end = chapterNumbers[0]; + List chapterRangeStrings = []; + foreach (int chapterNumber in chapterNumbers[1..]) + { + if (chapterNumber == end + 1) + { + end = chapterNumber; + } + else + { + if (start == end) + { + chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); + } + else + { + chapterRangeStrings.Add($"{start}-{end}"); + } + start = chapterNumber; + end = chapterNumber; + } + } + if (start == end) + { + chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); + } + else + { + chapterRangeStrings.Add($"{start}-{end}"); + } + return string.Join(", ", chapterRangeStrings); + } } diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs index c48b3f008..2903d0007 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs @@ -1,5 +1,3 @@ -using SIL.ServiceToolkit.Utils; - namespace SIL.ServiceToolkit.Services; [TestFixture] @@ -12,9 +10,9 @@ public void TestParallelCorpusAnalysis_FileFormatParatext() ParallelCorpus parallelCorpus = env.GetCorpora(paratextProject: true).First(); const string ExpectedTargetName = "typewriter_english"; - QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention( - new CorpusBundle([parallelCorpus]) - ); + QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention([ + parallelCorpus, + ]); Assert.Multiple(() => { @@ -29,9 +27,9 @@ public void TestParallelCorpusAnalysis_FileFormatText() using var env = new TestEnvironment(); ParallelCorpus parallelCorpus = env.GetCorpora(paratextProject: false).First(); - QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention( - new CorpusBundle([parallelCorpus]) - ); + QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention([ + parallelCorpus, + ]); Assert.Multiple(() => { @@ -48,7 +46,7 @@ public async Task TestParallelCorpusPreprocessor_FileFormatText() int trainCount = 0; int inferenceCount = 0; await env.Processor.PreprocessAsync( - new CorpusBundle(corpora), + corpora, (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) @@ -84,7 +82,7 @@ public async Task TestParallelCorpusPreprocessor_FileFormatParatext() var trainRefs = new List(); var inferenceRefs = new List(); await env.Processor.PreprocessAsync( - new CorpusBundle(corpora), + corpora, (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) From 272c9d2874460af6f981311a1393e627a83d3669 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 16 Mar 2026 12:45:40 -0400 Subject: [PATCH 3/5] Add GetChapters method; move mapping to shared service; fix other small typos; progress toward passing tests Fix pretranslation service bug Passing translation engine tests Move all mapping to corpus mapping service Remove commented out code --- src/Echo/src/EchoEngine/Program.cs | 2 +- .../EchoEngine/TranslationEngineServiceV1.cs | 6 +- .../WordAlignmentEngineServiceV1.cs | 10 +- .../IServiceCollectionExtensions.cs | 2 +- .../Services/NmtPreprocessBuildJob.cs | 4 +- .../Services/SmtTransferPreprocessBuildJob.cs | 4 +- .../Services/TranslationPreprocessBuildJob.cs | 4 +- .../WordAlignmentPreprocessBuildJob.cs | 4 +- .../Services/PreprocessBuildJobTests.cs | 10 +- .../Configuration/IServalBuilderExtensions.cs | 3 + .../Services/CorpusMappingService.cs | 344 +++++++++++++++ .../Services/EngineService.cs | 403 +++--------------- .../Services/ICorpusMappingService.cs | 7 + .../Services/PretranslationService.cs | 106 +---- .../TranslationEngineTests.cs | 98 +---- .../data/file_c/41MATTe1.SFM | 6 + .../data/file_c/Settings.xml | 34 ++ .../data/file_d/Settings.xml | 33 ++ .../Services/EngineServiceTests.cs | 55 +-- .../Services/PretranslationServiceTests.cs | 23 +- .../data/pt-project2/41MATTe2.SFM | 0 .../IServiceCollectionsExtensions.cs | 2 +- .../Models/ParallelCorpus.cs | 2 + .../Services/IParallelCorpusService.cs | 10 +- .../Services/ParallelCorpusService.cs | 40 +- .../Services/CorpusBundleTests.cs | 10 +- .../Services/ParallelCorpusServiceTests.cs | 4 +- 27 files changed, 615 insertions(+), 611 deletions(-) create mode 100644 src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs create mode 100644 src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs create mode 100644 src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM create mode 100644 src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml create mode 100644 src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml delete mode 100644 src/Serval/test/Serval.Translation.Tests/data/pt-project2/41MATTe2.SFM diff --git a/src/Echo/src/EchoEngine/Program.cs b/src/Echo/src/EchoEngine/Program.cs index 58a1c8da3..796260736 100644 --- a/src/Echo/src/EchoEngine/Program.cs +++ b/src/Echo/src/EchoEngine/Program.cs @@ -24,7 +24,7 @@ builder.Services.AddHostedService(); builder.Services.AddSingleton(); -builder.Services.AddParallelCorpusPreprocessor(); +builder.Services.AddParallelCorpusService(); builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy()); diff --git a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs index 6ed6178b8..dd84b9521 100644 --- a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs @@ -4,7 +4,7 @@ namespace EchoEngine; public class TranslationEngineServiceV1( BackgroundTaskQueue taskQueue, - IParallelCorpusService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, TranslationPlatformApi.TranslationPlatformApiClient platformApiClient ) : TranslationEngineApi.TranslationEngineApiBase { @@ -12,7 +12,7 @@ TranslationPlatformApi.TranslationPlatformApiClient platformApiClient private readonly BackgroundTaskQueue _taskQueue = taskQueue; private readonly TranslationPlatformApi.TranslationPlatformApiClient _platformApiClient = platformApiClient; - private readonly IParallelCorpusService _parallelCorpusPreprocessingService = parallelCorpusPreprocessingService; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; public override Task Create(CreateRequest request, ServerCallContext context) { @@ -124,7 +124,7 @@ await client.BuildStartedAsync( int pretranslateCount = 0; List pretranslationsRequests = []; - await _parallelCorpusPreprocessingService.PreprocessAsync( + await _parallelCorpusService.PreprocessAsync( request.Corpora.Select(Map), (row, _) => { diff --git a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs index 12c3e9d12..74ad1f07d 100644 --- a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs @@ -2,14 +2,12 @@ namespace EchoEngine; -public class WordAlignmentEngineServiceV1( - BackgroundTaskQueue taskQueue, - IParallelCorpusService parallelCorpusPreprocessingService -) : WordAlignmentEngineApi.WordAlignmentEngineApiBase +public class WordAlignmentEngineServiceV1(BackgroundTaskQueue taskQueue, IParallelCorpusService parallelCorpusService) + : WordAlignmentEngineApi.WordAlignmentEngineApiBase { private static readonly Empty Empty = new(); private readonly BackgroundTaskQueue _taskQueue = taskQueue; - private readonly IParallelCorpusService _parallelCorpusPreprocessingService = parallelCorpusPreprocessingService; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; public override Task Create(CreateRequest request, ServerCallContext context) { @@ -78,7 +76,7 @@ await client.BuildStartedAsync( int trainCount = 0; int wordAlignCount = 0; List wordAlignmentsRequests = []; - await _parallelCorpusPreprocessingService.PreprocessAsync( + await _parallelCorpusService.PreprocessAsync( request.Corpora.Select(Map), (row, _) => { diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs index 352768e4e..f5d82f55a 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs @@ -19,7 +19,7 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf (sp, cancellationToken) => sp.GetRequiredService().InitAsync(cancellationToken) ); - services.AddParallelCorpusPreprocessor(); + services.AddParallelCorpusService(); services.Configure(configuration.GetSection("Bugsnag")); services.AddBugsnag(); services.AddDiagnostics(); diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index f4ff9487b..2b5f06393 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -8,7 +8,7 @@ public class NmtPreprocessBuildJob( IBuildJobService buildJobService, ISharedFileService sharedFileService, ILanguageTagService languageTagService, - IParallelCorpusService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : TranslationPreprocessBuildJob( @@ -18,7 +18,7 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs index d9f9253c3..cdd618312 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs @@ -9,7 +9,7 @@ public class SmtTransferPreprocessBuildJob( ISharedFileService sharedFileService, IDistributedReaderWriterLockFactory lockFactory, IRepository trainSegmentPairs, - IParallelCorpusService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : TranslationPreprocessBuildJob( @@ -19,7 +19,7 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs index 47e07c1cd..9715afb92 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs @@ -7,7 +7,7 @@ public class TranslationPreprocessBuildJob( ILogger> logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : PreprocessBuildJob( @@ -17,7 +17,7 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs index 2eb3c06c9..8459232aa 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs @@ -7,7 +7,7 @@ public class WordAlignmentPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : PreprocessBuildJob( @@ -17,7 +17,7 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 1417d318d..c62ecaae4 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -356,7 +356,7 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() ) .Do(async callInfo => { - CorpusBundle corpusBundle = callInfo.ArgAt(0); + CorpusBundle corpusBundle = new(callInfo.ArgAt>(0)); DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( corpusBundle, ["LEV", "MRK", "MAT"], @@ -387,7 +387,7 @@ public async Task RunAsync_OnlyParseSelectedBooks_TrainOnBadBook() ArgumentException? ex = null; env.ParallelCorpusService.When(s => s.PreprocessAsync( - Arg.Any(), + Arg.Any>(), Arg.Any>(), Arg.Any>(), Arg.Any(), @@ -396,7 +396,7 @@ public async Task RunAsync_OnlyParseSelectedBooks_TrainOnBadBook() ) .Do(async callInfo => { - CorpusBundle corpusBundle = callInfo.ArgAt(0); + CorpusBundle corpusBundle = new(callInfo.ArgAt>(0)); DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( corpusBundle, ["LEV", "MRK", "MAT"], @@ -431,7 +431,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() ArgumentException? ex = null; env.ParallelCorpusService.When(s => s.PreprocessAsync( - Arg.Any(), + Arg.Any>(), Arg.Any>(), Arg.Any>(), Arg.Any(), @@ -440,7 +440,7 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() ) .Do(async callInfo => { - CorpusBundle corpusBundle = callInfo.ArgAt(0); + CorpusBundle corpusBundle = new(callInfo.ArgAt>(0)); DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( corpusBundle, ["LEV", "MRK", "MAT"], diff --git a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs index 46cf84114..cc298534d 100644 --- a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs @@ -10,7 +10,10 @@ public static IServalBuilder AddTranslation(this IServalBuilder builder) builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); + builder.Services.AddSingleton(); + builder.Services.AddScoped(); + builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); diff --git a/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs new file mode 100644 index 000000000..aa1fd168f --- /dev/null +++ b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs @@ -0,0 +1,344 @@ +using SIL.Extensions; + +namespace Serval.Translation.Services; + +public class CorpusMappingService( + IOptionsMonitor dataFileOptions, + IParallelCorpusService parallelCorpusService +) : ICorpusMappingService +{ + private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; + + public IReadOnlyList Map(Build build, Engine engine) + { + if (engine.ParallelCorpora.Any()) + { + return Map(build, engine.ParallelCorpora); + } + else + { + return Map(build, engine, engine.Corpora); + } + } + + public IReadOnlyList Map( + Build build, + Engine engine, + IReadOnlyList corpora + ) + { + List mappedParallelCorpora = []; + + Dictionary? trainingCorpora = build.TrainOn?.ToDictionary(c => c.CorpusRef!); + Dictionary? pretranslateCorpora = build.Pretranslate?.ToDictionary(c => + c.CorpusRef! + ); + bool trainOnAllCorpora = trainingCorpora is null; + bool pretranslateAllCorpora = pretranslateCorpora is null; + + foreach ( + Corpus source in corpora.Where(c => + trainingCorpora == null + || trainingCorpora.ContainsKey(c.Id) + || pretranslateCorpora == null + || pretranslateCorpora.ContainsKey(c.Id) + ) + ) + { + TrainingCorpus? trainingCorpus = trainingCorpora?.GetValueOrDefault(source.Id); + PretranslateCorpus? pretranslateCorpus = pretranslateCorpora?.GetValueOrDefault(source.Id); + + IEnumerable sourceFiles = source.SourceFiles.Select(Map); + IEnumerable targetFiles = source.TargetFiles.Select(Map); + SIL.ServiceToolkit.Models.MonolingualCorpus sourceCorpus = new() + { + Id = source.Id, + Language = source.SourceLanguage, + Files = source.SourceFiles.Select(Map).ToArray(), + }; + SIL.ServiceToolkit.Models.MonolingualCorpus targetCorpus = new() + { + Id = source.Id, + Language = source.TargetLanguage, + Files = source.TargetFiles.Select(Map).ToArray(), + }; + + if (trainingCorpus is not null) + { + if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null) + { + throw new InvalidOperationException( + $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for trainOn" + ); + } + if (trainingCorpus.TextIds is not null) + { + sourceCorpus.TrainOnTextIds.AddRange(trainingCorpus.TextIds); + targetCorpus.TrainOnTextIds.AddRange(trainingCorpus.TextIds); + } + if (!string.IsNullOrEmpty(trainingCorpus.ScriptureRange)) + { + if ( + targetCorpus.Files.Count > 1 + || targetCorpus.Files[0].Format != SIL.ServiceToolkit.Models.FileFormat.Paratext + ) + { + throw new InvalidOperationException( + $"The corpus {source.Id} is not compatible with using a scripture range" + ); + } + var chapters = _parallelCorpusService + .GetChapters( + corpora.Select(c => Map(c, engine)).ToArray(), + GetFilePath(targetCorpus.Files[0].Location), + trainingCorpus.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + sourceCorpus.TrainOnChapters = chapters; + targetCorpus.TrainOnChapters = chapters; + } + } + + if (pretranslateCorpus is not null) + { + if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null) + { + throw new InvalidOperationException( + $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for 'pretranslate'." + ); + } + if (pretranslateCorpus.TextIds is not null) + sourceCorpus.InferenceTextIds.AddRange(pretranslateCorpus.TextIds); + if (!string.IsNullOrEmpty(pretranslateCorpus.ScriptureRange)) + { + if ( + targetCorpus.Files.Count > 1 + || targetCorpus.Files[0].Format != SIL.ServiceToolkit.Models.FileFormat.Paratext + ) + { + throw new InvalidOperationException( + $"The corpus {source.Id} is not compatible with using a scripture range" + ); + } + sourceCorpus.InferenceChapters = _parallelCorpusService + .GetChapters( + corpora.Select(c => Map(c, engine)).ToArray(), + GetFilePath(targetCorpus.Files[0].Location), + pretranslateCorpus.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + } + } + SIL.ServiceToolkit.Models.ParallelCorpus corpus = new() + { + Id = source.Id, + SourceCorpora = [sourceCorpus], + TargetCorpora = [targetCorpus], + TrainOnAllCorpora = trainOnAllCorpora, + PretranslateAllCorpora = pretranslateAllCorpora, + }; + mappedParallelCorpora.Add(corpus); + } + return mappedParallelCorpora; + } + + private IReadOnlyList Map( + Build build, + IReadOnlyList parallelCorpora + ) + { + List mappedParallelCorpora = []; + Dictionary? trainingCorpora = build.TrainOn?.ToDictionary(c => c.ParallelCorpusRef!); + Dictionary? pretranslateCorpora = build.Pretranslate?.ToDictionary(c => + c.ParallelCorpusRef! + ); + + bool trainOnAllCorpora = trainingCorpora is null; + bool pretranslateAllCorpora = pretranslateCorpora is null; + + parallelCorpora = parallelCorpora + .Where(pc => + trainingCorpora == null + || trainingCorpora.ContainsKey(pc.Id) + || pretranslateCorpora == null + || pretranslateCorpora.ContainsKey(pc.Id) + ) + .ToArray(); + foreach (ParallelCorpus source in parallelCorpora) + { + TrainingCorpus? trainingCorpus = trainingCorpora?.GetValueOrDefault(source.Id); + PretranslateCorpus? pretranslateCorpus = pretranslateCorpora?.GetValueOrDefault(source.Id); + + string? referenceFileLocation = + source.TargetCorpora.Count > 0 && source.TargetCorpora[0].Files.Count > 0 + ? Map(source.TargetCorpora[0].Files[0]).Location + : null; + + mappedParallelCorpora.Add( + new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source + .SourceCorpora.Select(sc => + Map( + parallelCorpora, + sc, + trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), + pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), + referenceFileLocation + ) + ) + .ToArray(), + TargetCorpora = source + .TargetCorpora.Select(tc => + Map( + parallelCorpora, + tc, + trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(), + null, + referenceFileLocation + ) + ) + .ToArray(), + TrainOnAllCorpora = trainOnAllCorpora, + PretranslateAllCorpora = pretranslateAllCorpora, + } + ); + } + return mappedParallelCorpora; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map( + IReadOnlyList parallelCorpora, + MonolingualCorpus inputCorpus, + ParallelCorpusFilter? trainingFilter, + ParallelCorpusFilter? pretranslateFilter, + string? referenceFileLocation + ) + { + Dictionary>? trainOnChapters = null; + if ( + trainingFilter is not null + && trainingFilter.ScriptureRange is not null + && referenceFileLocation is not null + ) + { + trainOnChapters = _parallelCorpusService + .GetChapters( + parallelCorpora.Select(Map).ToArray(), + GetFilePath(referenceFileLocation), + trainingFilter.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + } + + Dictionary>? pretranslateChapters = null; + if ( + pretranslateFilter is not null + && pretranslateFilter.ScriptureRange is not null + && referenceFileLocation is not null + ) + { + pretranslateChapters = _parallelCorpusService + .GetChapters( + parallelCorpora.Select(Map).ToArray(), + GetFilePath(referenceFileLocation), + pretranslateFilter.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + } + + var returnCorpus = new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = inputCorpus.Id, + Language = inputCorpus.Language, + Files = inputCorpus.Files.Select(Map).ToArray(), + }; + + if ( + trainingFilter is not null + && trainingFilter.TextIds is not null + && trainingFilter.ScriptureRange is not null + ) + { + throw new InvalidOperationException( + "Cannot specify both TextIds and ScriptureRange in the training filter." + ); + } + + returnCorpus.TrainOnChapters = trainOnChapters; + returnCorpus.TrainOnTextIds = trainingFilter?.TextIds?.ToHashSet(); + + if ( + pretranslateFilter is not null + && pretranslateFilter.TextIds is not null + && pretranslateFilter.ScriptureRange is not null + ) + { + throw new InvalidOperationException( + "Cannot specify both TextIds and ScriptureRange in the pretranslation filter." + ); + } + + returnCorpus.InferenceChapters = pretranslateChapters; + returnCorpus.InferenceTextIds = pretranslateFilter?.TextIds?.ToHashSet(); + + return returnCorpus; + } + + public SIL.ServiceToolkit.Models.ParallelCorpus Map(Corpus source, Engine engine) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceFiles.Select(f => Map(f, engine.SourceLanguage)).ToArray(), + TargetCorpora = source.TargetFiles.Select(f => Map(f, engine.TargetLanguage)).ToArray(), + }; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map(CorpusFile source, string language) + { + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = language, + Files = [Map(source)], + }; + } + + private SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) + { + return new SIL.ServiceToolkit.Models.CorpusFile + { + Location = GetFilePath(source.Filename), + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, + TextId = source.TextId, + }; + } + + private SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceCorpora.Select(Map).ToArray(), + TargetCorpora = source.TargetCorpora.Select(Map).ToArray(), + }; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) + { + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = source.Language, + Files = source.Files.Select(Map).ToList(), + }; + } + + public string GetFilePath(string filename) + { + return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); + } +} diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 1b987fe7a..40fbe843d 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -9,22 +9,22 @@ public class EngineService( IRepository pretranslations, IScopedMediator mediator, GrpcClientFactory grpcClientFactory, - IOptionsMonitor dataFileOptions, IDataAccessContext dataAccessContext, ILoggerFactory loggerFactory, IOutboxService outboxService, - IOptionsMonitor translationOptions + IOptionsMonitor translationOptions, + ICorpusMappingService corpusMappingService ) : OwnedEntityServiceBase(engines), IEngineService { private readonly IRepository _builds = builds; private readonly IRepository _pretranslations = pretranslations; private readonly IScopedMediator _mediator = mediator; private readonly GrpcClientFactory _grpcClientFactory = grpcClientFactory; - private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; private readonly IDataAccessContext _dataAccessContext = dataAccessContext; private readonly ILogger _logger = loggerFactory.CreateLogger(); private readonly IOutboxService _outboxService = outboxService; private readonly IOptionsMonitor _translationOptions = translationOptions; + private readonly ICorpusMappingService _corpusMappingService = corpusMappingService; public async Task TranslateAsync( string engineId, @@ -262,24 +262,6 @@ await _outboxService.EnqueueMessageAsync( ); } - protected virtual Dictionary> GetChapters(string fileLocation, string scriptureRange) - { - try - { - using var archive = new ZipContainer( - Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, fileLocation) - ); - return ScriptureRangeParser.GetChapters( - scriptureRange, - new ZipParatextProjectSettingsParser(archive).Parse().Versification - ); - } - catch (ArgumentException ae) - { - throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); - } - } - public async Task StartBuildAsync(Build build, CancellationToken cancellationToken = default) { return await _dataAccessContext.WithTransactionAsync( @@ -301,77 +283,13 @@ await _builds.ExistsAsync( await _builds.InsertAsync(build, ct); Engine engine = await GetAsync(build.EngineRef, ct); - StartBuildRequest request; - if (engine.ParallelCorpora.Any()) + StartBuildRequest request = new StartBuildRequest { - Dictionary? trainOn = build.TrainOn?.ToDictionary(c => - c.ParallelCorpusRef! - ); - Dictionary? pretranslate = build.Pretranslate?.ToDictionary(c => - c.ParallelCorpusRef! - ); - IReadOnlyList parallelCorpora = engine - .ParallelCorpora.Where(pc => - trainOn == null - || trainOn.ContainsKey(pc.Id) - || pretranslate == null - || pretranslate.ContainsKey(pc.Id) - ) - .ToList(); - - request = new StartBuildRequest - { - EngineType = engine.Type, - EngineId = engine.Id, - BuildId = build.Id, - Corpora = - { - parallelCorpora.Select(c => - Map( - c, - trainOn?.GetValueOrDefault(c.Id), - pretranslate?.GetValueOrDefault(c.Id), - trainOn is null, - pretranslate is null - ) - ), - }, - }; - } - else - { - Dictionary? trainOn = build.TrainOn?.ToDictionary(c => c.CorpusRef!); - Dictionary? pretranslate = build.Pretranslate?.ToDictionary(c => - c.CorpusRef! - ); - IReadOnlyList corpora = engine - .Corpora.Where(c => - trainOn == null - || trainOn.ContainsKey(c.Id) - || pretranslate == null - || pretranslate.ContainsKey(c.Id) - ) - .ToList(); - - request = new StartBuildRequest - { - EngineType = engine.Type, - EngineId = engine.Id, - BuildId = build.Id, - Corpora = - { - corpora.Select(c => - Map( - c, - trainOn?.GetValueOrDefault(c.Id), - pretranslate?.GetValueOrDefault(c.Id), - trainOn is null, - pretranslate is null - ) - ), - }, - }; - } + EngineType = engine.Type, + EngineId = engine.Id, + BuildId = build.Id, + Corpora = { _corpusMappingService.Map(build, engine).Select(Map) }, + }; if (build.Options is not null) request.Options = JsonSerializer.Serialize(build.Options); @@ -920,296 +838,99 @@ private Models.WordGraphArc Map(V1.WordGraphArc source) }; } - private V1.ParallelCorpus Map( - Corpus source, - TrainingCorpus? trainingCorpus, - PretranslateCorpus? pretranslateCorpus, - bool trainOnAllCorpora, - bool pretranslateOnAllCorpora - ) + private static V1.ParallelCorpus Map(SIL.ServiceToolkit.Models.ParallelCorpus source) { - IEnumerable sourceFiles = source.SourceFiles.Select(Map); - IEnumerable targetFiles = source.TargetFiles.Select(Map); - V1.MonolingualCorpus sourceCorpus = new() - { - Language = source.SourceLanguage, - Files = { source.SourceFiles.Select(Map) }, - }; - V1.MonolingualCorpus targetCorpus = new() - { - Language = source.TargetLanguage, - Files = { source.TargetFiles.Select(Map) }, - }; - - if ( - trainOnAllCorpora - || (trainingCorpus is not null && trainingCorpus.TextIds is null && trainingCorpus.ScriptureRange is null) - ) - { - sourceCorpus.TrainOnAll = true; - targetCorpus.TrainOnAll = true; - } - else if (trainingCorpus is not null) - { - if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null) - { - throw new InvalidOperationException( - $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for trainOn" - ); - } - if (trainingCorpus.TextIds is not null) - { - sourceCorpus.TrainOnTextIds.Add(trainingCorpus.TextIds); - targetCorpus.TrainOnTextIds.Add(trainingCorpus.TextIds); - } - if (!string.IsNullOrEmpty(trainingCorpus.ScriptureRange)) - { - if (targetCorpus.Files.Count > 1 || targetCorpus.Files[0].Format != V1.FileFormat.Paratext) - { - throw new InvalidOperationException( - $"The corpus {source.Id} is not compatible with using a scripture range" - ); - } - var chapters = GetChapters(targetCorpus.Files[0].Location, trainingCorpus.ScriptureRange) - .Select( - (kvp) => - { - var scriptureChapters = new ScriptureChapters(); - scriptureChapters.Chapters.Add(kvp.Value); - return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary(); - sourceCorpus.TrainOnChapters.Add(chapters); - targetCorpus.TrainOnChapters.Add(chapters); - } - } - if ( - pretranslateOnAllCorpora - || ( - pretranslateCorpus is not null - && pretranslateCorpus.TextIds is null - && pretranslateCorpus.ScriptureRange is null - ) - ) - { - sourceCorpus.PretranslateAll = true; - targetCorpus.PretranslateAll = true; - } - else if (pretranslateCorpus is not null) - { - if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null) - { - throw new InvalidOperationException( - $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for 'pretranslate'." - ); - } - if (pretranslateCorpus.TextIds is not null) - sourceCorpus.PretranslateTextIds.Add(pretranslateCorpus.TextIds); - if (!string.IsNullOrEmpty(pretranslateCorpus.ScriptureRange)) - { - if (targetCorpus.Files.Count > 1 || targetCorpus.Files[0].Format != V1.FileFormat.Paratext) - { - throw new InvalidOperationException( - $"The corpus {source.Id} is not compatible with using a scripture range" - ); - } - sourceCorpus.PretranslateChapters.Add( - GetChapters(targetCorpus.Files[0].Location, pretranslateCorpus.ScriptureRange) - .Select( - (kvp) => - { - var scriptureChapters = new ScriptureChapters(); - scriptureChapters.Chapters.Add(kvp.Value); - return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary() - ); - } - } - V1.ParallelCorpus corpus = new() { Id = source.Id }; - if (sourceCorpus.Files.Count > 0) - corpus.SourceCorpora.Add(sourceCorpus); - if (targetCorpus.Files.Count > 0) - corpus.TargetCorpora.Add(targetCorpus); - return corpus; - } - - private V1.ParallelCorpus Map( - Shared.Models.ParallelCorpus source, - TrainingCorpus? trainingCorpus, - PretranslateCorpus? pretranslateCorpus, - bool trainOnAllCorpora, - bool pretranslateOnAllCorpora - ) - { - string? referenceFileLocation = - source.TargetCorpora.Count > 0 && source.TargetCorpora[0].Files.Count > 0 - ? Map(source.TargetCorpora[0].Files[0]).Location - : null; - - bool trainOnAllSources = - trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.SourceFilters is null); - bool pretranslateAllSources = - pretranslateOnAllCorpora || (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null); - - bool trainOnAllTargets = - trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.TargetFilters is null); - bool pretranslateAllTargets = pretranslateOnAllCorpora || pretranslateCorpus is not null; // there is no pretranslate Target filter. - return new V1.ParallelCorpus { Id = source.Id, SourceCorpora = { - source.SourceCorpora.Select(sc => - Map( - sc, - trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), - pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), - referenceFileLocation, - trainOnAllSources, - pretranslateAllSources - ) - ), + source.SourceCorpora.Select(c => Map(c, source.TrainOnAllCorpora, source.PretranslateAllCorpora)), }, TargetCorpora = { - source.TargetCorpora.Select(tc => - Map( - tc, - trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(), - null, - referenceFileLocation, - trainOnAllTargets, - pretranslateAllTargets - ) - ), + source.TargetCorpora.Select(c => Map(c, source.TrainOnAllCorpora, source.PretranslateAllCorpora)), }, }; } - private V1.MonolingualCorpus Map( - Shared.Models.MonolingualCorpus inputCorpus, - ParallelCorpusFilter? trainingFilter, - ParallelCorpusFilter? pretranslateFilter, - string? referenceFileLocation, + private static V1.MonolingualCorpus Map( + SIL.ServiceToolkit.Models.MonolingualCorpus source, bool trainOnAll, - bool pretranslateOnAll + bool pretranslateAll ) { - Dictionary? trainOnChapters = null; - if ( - trainingFilter is not null - && trainingFilter.ScriptureRange is not null - && referenceFileLocation is not null - ) + var corpus = new V1.MonolingualCorpus { - trainOnChapters = GetChapters(referenceFileLocation, trainingFilter.ScriptureRange) - .Select( - (kvp) => - { - var scriptureChapters = new ScriptureChapters(); - scriptureChapters.Chapters.Add(kvp.Value); - return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary(); - } + Id = source.Id, + Language = source.Language, + Files = { source.Files.Select(Map) }, + }; - Dictionary? pretranslateChapters = null; - if ( - pretranslateFilter is not null - && pretranslateFilter.ScriptureRange is not null - && referenceFileLocation is not null - ) + if (trainOnAll || (source.TrainOnTextIds is null && source.TrainOnChapters is null)) + { + corpus.TrainOnAll = true; + } + if (source.TrainOnTextIds is not null) + { + corpus.TrainOnTextIds.Add(source.TrainOnTextIds); + } + if (source.TrainOnChapters is not null) { - pretranslateChapters = GetChapters(referenceFileLocation, pretranslateFilter.ScriptureRange) - .Select( - (kvp) => + corpus.TrainOnChapters.Add( + source + .TrainOnChapters?.Select(kvp => { var scriptureChapters = new ScriptureChapters(); scriptureChapters.Chapters.Add(kvp.Value); return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary(); - } - - var returnCorpus = new V1.MonolingualCorpus - { - Id = inputCorpus.Id, - Language = inputCorpus.Language, - Files = { inputCorpus.Files.Select(Map) }, - }; - - if ( - trainingFilter is not null - && trainingFilter.TextIds is not null - && trainingFilter.ScriptureRange is not null - ) - { - throw new InvalidOperationException( - "Cannot specify both TextIds and ScriptureRange in the training filter." + }) + .ToDictionary() ); } - if ( - trainOnAll - || (trainingFilter is not null && trainingFilter.TextIds is null && trainingFilter.ScriptureRange is null) - ) + if (pretranslateAll || (source.InferenceTextIds is null && source.InferenceChapters is null)) { - returnCorpus.TrainOnAll = true; + corpus.PretranslateAll = true; } - else + else if (source.InferenceTextIds is not null) { - if (trainOnChapters is not null) - returnCorpus.TrainOnChapters.Add(trainOnChapters); - if (trainingFilter?.TextIds is not null) - returnCorpus.TrainOnTextIds.Add(trainingFilter.TextIds); + corpus.PretranslateTextIds.Add(source.InferenceTextIds); } - - if ( - pretranslateFilter is not null - && pretranslateFilter.TextIds is not null - && pretranslateFilter.ScriptureRange is not null - ) + else if (source.InferenceChapters is not null) { - throw new InvalidOperationException( - "Cannot specify both TextIds and ScriptureRange in the pretranslation filter." + corpus.PretranslateChapters.Add( + source + .InferenceChapters?.Select(kvp => + { + var scriptureChapters = new ScriptureChapters(); + scriptureChapters.Chapters.Add(kvp.Value); + return (kvp.Key, scriptureChapters); + }) + .ToDictionary() ); } - if ( - pretranslateOnAll - || ( - pretranslateFilter is not null - && pretranslateFilter.TextIds is null - && pretranslateFilter.ScriptureRange is null - ) - ) - { - returnCorpus.PretranslateAll = true; - } - else - { - if (pretranslateChapters is not null) - returnCorpus.PretranslateChapters.Add(pretranslateChapters); - if (pretranslateFilter?.TextIds is not null) - returnCorpus.PretranslateTextIds.Add(pretranslateFilter.TextIds); - } - - return returnCorpus; + return corpus; } - private V1.CorpusFile Map(Shared.Models.CorpusFile source) + private static V1.CorpusFile Map(SIL.ServiceToolkit.Models.CorpusFile source) { return new V1.CorpusFile { + Location = source.Location, TextId = source.TextId, - Format = (V1.FileFormat)source.Format, - Location = Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, source.Filename), + Format = Map(source.Format), + }; + } + + private static V1.FileFormat Map(SIL.ServiceToolkit.Models.FileFormat source) + { + return source switch + { + SIL.ServiceToolkit.Models.FileFormat.Text => V1.FileFormat.Text, + SIL.ServiceToolkit.Models.FileFormat.Paratext => V1.FileFormat.Paratext, + _ => throw new InvalidEnumArgumentException(nameof(source)), }; } } diff --git a/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs new file mode 100644 index 000000000..01ee59985 --- /dev/null +++ b/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs @@ -0,0 +1,7 @@ +namespace Serval.Translation.Services; + +public interface ICorpusMappingService +{ + IReadOnlyList Map(Build build, Engine engine); + string GetFilePath(string filename); +} diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 311085b7a..b4d4d856f 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -6,16 +6,16 @@ public class PretranslationService( IRepository pretranslations, IRepository engines, IRepository builds, - IOptionsMonitor dataFileOptions, + ICorpusMappingService corpusMappingService, IParallelCorpusService parallelCorpusService ) : EntityServiceBase(pretranslations), IPretranslationService { private readonly IRepository _engines = engines; private readonly IRepository _builds = builds; private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; + private readonly ICorpusMappingService _corpusMappingService = corpusMappingService; private const string AIDisclaimerRemark = "This draft of {0} was generated using AI on {1}. It should be reviewed and edited carefully."; - private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; public async Task> GetAllAsync( string engineId, @@ -107,48 +107,9 @@ public async Task GetUsfmAsync( List remarks = [disclaimerRemark, markerPlacementRemark]; - SIL.ServiceToolkit.Models.ParallelCorpus[] parallelCorpora; - if (build.TrainOn == null || build.Pretranslate == null) - { - if (parallelCorpus != null) - { - parallelCorpora = engine!.ParallelCorpora.Select(Map).ToArray(); - } - else - { - parallelCorpora = engine!.Corpora.Select(c => Map(c, engine)).ToArray(); - } - } - else - { - HashSet referencedCorpora; - if (parallelCorpus != null) - { - referencedCorpora = build - .TrainOn.Select(t => t.ParallelCorpusRef) - .Concat(build.Pretranslate.Select(p => p.ParallelCorpusRef)) - .Where(r => r != null) - .Select(r => r!) - .ToHashSet(); - parallelCorpora = engine! - .ParallelCorpora.Where(pc => referencedCorpora.Contains(pc.Id)) - .Select(Map) - .ToArray(); - } - else - { - referencedCorpora = build - .TrainOn.Select(t => t.CorpusRef) - .Concat(build.Pretranslate.Select(p => p.CorpusRef)) - .Where(r => r != null) - .Select(r => r!) - .ToHashSet(); - parallelCorpora = engine! - .Corpora.Where(c => referencedCorpora.Contains(c.Id)) - .Select(c => Map(c, engine)) - .ToArray(); - } - } + SIL.ServiceToolkit.Models.ParallelCorpus[] parallelCorpora = _corpusMappingService + .Map(build, engine!) + .ToArray(); IEnumerable pretranslations = ( await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) @@ -185,7 +146,7 @@ await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) parallelCorpora, corpusId, textId, - pretranslations.ToArray(), + textOrigin == PretranslationUsfmTextOrigin.OnlyExisting ? [] : pretranslations.ToArray(), textBehavior, Map(paragraphMarkerBehavior), Map(embedBehavior), @@ -286,59 +247,4 @@ private static UpdateUsfmMarkerBehavior Map(PretranslationUsfmMarkerBehavior beh _ => throw new InvalidEnumArgumentException(nameof(behavior)), }; } - - private SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) - { - return new SIL.ServiceToolkit.Models.ParallelCorpus - { - Id = source.Id, - SourceCorpora = source.SourceCorpora.Select(Map).ToArray(), - TargetCorpora = source.TargetCorpora.Select(Map).ToArray(), - }; - } - - private SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) - { - return new SIL.ServiceToolkit.Models.MonolingualCorpus - { - Id = source.Id, - Language = source.Language, - Files = source.Files.Select(Map).ToList(), - }; - } - - private SIL.ServiceToolkit.Models.ParallelCorpus Map(Corpus source, Engine engine) - { - return new SIL.ServiceToolkit.Models.ParallelCorpus - { - Id = source.Id, - SourceCorpora = source.SourceFiles.Select(f => Map(f, engine.SourceLanguage)).ToArray(), - TargetCorpora = source.TargetFiles.Select(f => Map(f, engine.TargetLanguage)).ToArray(), - }; - } - - private SIL.ServiceToolkit.Models.MonolingualCorpus Map(CorpusFile source, string language) - { - return new SIL.ServiceToolkit.Models.MonolingualCorpus - { - Id = source.Id, - Language = language, - Files = [Map(source)], - }; - } - - private SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) - { - return new SIL.ServiceToolkit.Models.CorpusFile - { - Location = GetFilePath(source.Filename), - Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, - TextId = source.TextId, - }; - } - - private string GetFilePath(string filename) - { - return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); - } } diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs index 3b287d705..24ee437e4 100644 --- a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs @@ -1,8 +1,8 @@ +using System.IO.Compression; using Google.Protobuf.WellKnownTypes; using Serval.Translation.Configuration; using Serval.Translation.Models; using Serval.Translation.V1; -using SIL.ServiceToolkit.Services; using static Serval.ApiServer.Utils; using Phase = Serval.Client.Phase; using PhaseStage = Serval.Client.PhaseStage; @@ -2323,7 +2323,7 @@ await _env.Builds.InsertAsync( Assert.That( usfm.Replace("\r\n", "\n"), Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \h @@ -2557,6 +2557,8 @@ public void TearDown() private class TestEnvironment : DisposableBase { private readonly IServiceScope _scope; + private readonly IOptionsMonitor _dataFileOptions; + private static readonly string TestDataPath = Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "data"); public readonly MongoClient MongoClient; public TestEnvironment() @@ -2740,6 +2742,9 @@ public TestEnvironment() .Returns(CreateAsyncUnaryCall(StatusCode.Unimplemented)); SmtClient = Substitute.For(); + _dataFileOptions = _scope.ServiceProvider.GetRequiredService>(); + ZipParatextProject(FILE3_FILENAME); + ZipParatextProject(FILE4_FILENAME); } public ServalWebApplicationFactory Factory { get; } @@ -2768,7 +2773,6 @@ public TranslationBuildsClient CreateTranslationBuildsClient(IEnumerable .CreateClient("Nmt") .Returns(NmtClient); services.AddSingleton(grpcClientFactory); - services.AddTransient(CreateFileSystem); }); }) .CreateClient(); @@ -2798,7 +2802,6 @@ public TranslationEnginesClient CreateTranslationEnginesClient(IEnumerable("Nmt") .Returns(NmtClient); services.AddSingleton(grpcClientFactory); - services.AddTransient(CreateFileSystem); }); }) .CreateClient(); @@ -2859,15 +2862,7 @@ public TranslationEngineTypesClient CreateTranslationEngineTypesClient(IEnumerab public DataFilesClient CreateDataFilesClient() { IEnumerable scope = [Scopes.DeleteFiles, Scopes.ReadFiles, Scopes.UpdateFiles, Scopes.CreateFiles]; - HttpClient httpClient = Factory - .WithWebHostBuilder(builder => - { - builder.ConfigureTestServices(services => - { - services.AddTransient(CreateFileSystem); - }); - }) - .CreateClient(); + HttpClient httpClient = Factory.CreateClient(); if (scope is not null) httpClient.DefaultRequestHeaders.Add("Scope", string.Join(" ", scope)); return new DataFilesClient(httpClient); @@ -2888,80 +2883,21 @@ public void ResetDatabases() MongoClient.DropDatabase("serval_test_jobs"); } - private static IFileSystem CreateFileSystem(IServiceProvider sp) - { - IFileSystem fileSystem = Substitute.For(); - IOptionsMonitor dataFileOptions = sp.GetRequiredService< - IOptionsMonitor - >(); - fileSystem - .OpenZipFile(GetFilePath(dataFileOptions, FILE3_FILENAME)) - .Returns(ci => - { - IZipContainer source = CreateZipContainer("SRC"); - source.EntryExists("MATSRC.SFM").Returns(true); - string usfm = - $@"\id MAT - SRC -\h Matthew -\c 1 -\p -\v 1 Chapter one, verse one. -\v 2 Chapter one, verse two. -"; - source.OpenEntry("MATSRC.SFM").Returns(ci => new MemoryStream(Encoding.UTF8.GetBytes(usfm))); - return source; - }); - fileSystem - .OpenZipFile(GetFilePath(dataFileOptions, FILE4_FILENAME)) - .Returns(ci => - { - IZipContainer target = CreateZipContainer("TRG"); - target.EntryExists("MATTRG.SFM").Returns(false); - return target; - }); - fileSystem.OpenWrite(Arg.Any()).Returns(ci => new MemoryStream()); - return fileSystem; - } - - private static IZipContainer CreateZipContainer(string name) - { - IZipContainer container = Substitute.For(); - container.EntryExists("Settings.xml").Returns(true); - XElement settingsXml = new( - "ScriptureText", - new XElement("StyleSheet", "usfm.sty"), - new XElement("Guid", "Id"), - new XElement("Name", name), - new XElement("FullName", name), - new XElement("Encoding", "65001"), - new XElement( - "Naming", - new XAttribute("PrePart", ""), - new XAttribute("PostPart", $"{name}.SFM"), - new XAttribute("BookNameForm", "MAT") - ), - new XElement("BiblicalTermsListSetting", "Major::BiblicalTerms.xml") - ); - container - .OpenEntry("Settings.xml") - .Returns(new MemoryStream(Encoding.UTF8.GetBytes(settingsXml.ToString()))); - container.EntryExists("custom.vrs").Returns(false); - container.EntryExists("usfm.sty").Returns(false); - container.EntryExists("custom.sty").Returns(false); - return container; - } - - private static string GetFilePath(IOptionsMonitor dataFileOptions, string fileName) - { - return Path.Combine(dataFileOptions.CurrentValue.FilesDirectory, fileName); - } - protected override void DisposeManagedResources() { _scope.Dispose(); Factory.Dispose(); ResetDatabases(); } + + private string ZipParatextProject(string name) + { + string fileName = Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, name); + if (File.Exists(fileName)) + File.Delete(fileName); + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, name), fileName); + return fileName; + } } } diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM new file mode 100644 index 000000000..24df58815 --- /dev/null +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM @@ -0,0 +1,6 @@ +\id MAT - SRC +\h Matthew +\c 1 +\p +\v 1 Chapter one, verse one. +\v 2 Chapter one, verse two. diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml new file mode 100644 index 000000000..6358f4f0b --- /dev/null +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test1 + 65001 + T + + NFC + Te1 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Tes.SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml new file mode 100644 index 000000000..4ce9e238d --- /dev/null +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml @@ -0,0 +1,33 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test2 + 65001 + T + + NFC + Te2 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs index bc47ec373..5c6a17b1b 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs @@ -2365,18 +2365,29 @@ public TestEnvironment() translationOptions.CurrentValue.Returns( new TranslationOptions { Engines = [new EngineInfo { Type = "Smt" }] } ); + var parallelCorpusService = Substitute.For(); + parallelCorpusService + .GetChapters( + Arg.Any>(), + Arg.Any(), + Arg.Any() + ) + .Returns(callInfo => + { + return ScriptureRangeParser.GetChapters(callInfo.ArgAt(2)); + }); - Service = new TestEngineService( + Service = new EngineService( Engines, new MemoryRepository(), Pretranslations, Substitute.For(), grpcClientFactory, - dataFileOptions, new MemoryDataAccessContext(), new LoggerFactory(), OutboxService, - translationOptions + translationOptions, + new CorpusMappingService(dataFileOptions, parallelCorpusService) ); } @@ -2848,42 +2859,4 @@ private static AsyncUnaryCall CreateAsyncUnaryCall(TRespon ); } } - - private class TestEngineService( - IRepository engines, - IRepository builds, - IRepository pretranslations, - IScopedMediator mediator, - GrpcClientFactory grpcClientFactory, - IOptionsMonitor dataFileOptions, - IDataAccessContext dataAccessContext, - ILoggerFactory loggerFactory, - IOutboxService outboxService, - IOptionsMonitor translationOptions - ) - : EngineService( - engines, - builds, - pretranslations, - mediator, - grpcClientFactory, - dataFileOptions, - dataAccessContext, - loggerFactory, - outboxService, - translationOptions - ) - { - protected override Dictionary> GetChapters(string fileLocation, string scriptureRange) - { - try - { - return ScriptureRangeParser.GetChapters(scriptureRange); - } - catch (ArgumentException ae) - { - throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); - } - } - } } diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index 286576f82..78b564d0a 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -19,7 +19,7 @@ public async Task GetUsfmAsync_Source_PreferExisting() Assert.That( usfm, Is.EqualTo( - @"\id MAT - Test2 + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -46,7 +46,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - Test2 + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -73,7 +73,7 @@ public async Task GetUsfmAsync_Source_OnlyExisting() Assert.That( usfm, Is.EqualTo( - @"\id MAT - Test2 + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -100,7 +100,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - Test2 + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -128,7 +128,7 @@ public async Task GetUsfmAsync_Source_PlaceMarkers() Assert.That( usfm, Is.EqualTo( - @"\id MAT - Test2 + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Embed markers were moved to the end of the verse. Paragraph breaks have positions preserved. Style markers were removed. \c 1 @@ -220,7 +220,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() Assert.That( usfm, Is.EqualTo( - @"\id MAT - Test2 + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -424,7 +424,7 @@ public void GetUsfmAsync_BadPretranslationVerseRef() [TestCase(new int[] { 1 }, "1")] public void GetChapterRanges(int[] chapterNumbers, string expectedRangeString) { - string actualRangeString = PretranslationService.GetChapterRangesString(chapterNumbers.ToList()); + string actualRangeString = ParallelCorpusService.GetChapterRangesString(chapterNumbers.ToList()); Assert.That(actualRangeString, Is.EqualTo(expectedRangeString)); } @@ -650,7 +650,14 @@ public TestEnvironment(bool addMatthew = false) ]); IOptionsMonitor dataFileOptions = Substitute.For>(); dataFileOptions.CurrentValue.Returns(new DataFileOptions() { FilesDirectory = _tempDir.Path }); - Service = new PretranslationService(Pretranslations, Engines, Builds, dataFileOptions); + var parallelCorpusService = new ParallelCorpusService(); + Service = new PretranslationService( + Pretranslations, + Engines, + Builds, + new CorpusMappingService(dataFileOptions, parallelCorpusService), + parallelCorpusService + ); } public PretranslationService Service { get; } diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project2/41MATTe2.SFM b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/41MATTe2.SFM deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs index 6b681401c..d4941ca26 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs @@ -2,7 +2,7 @@ namespace Microsoft.Extensions.DependencyInjection; public static class IServiceCollectionExtensions { - public static IServiceCollection AddParallelCorpusPreprocessor(this IServiceCollection services) + public static IServiceCollection AddParallelCorpusService(this IServiceCollection services) { services.TryAddSingleton(); return services; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs index 833741629..8c47cfdf9 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs @@ -5,4 +5,6 @@ public record ParallelCorpus public required string Id { get; set; } public IReadOnlyList SourceCorpora { get; set; } = new List(); public IReadOnlyList TargetCorpora { get; set; } = new List(); + public bool TrainOnAllCorpora { get; set; } + public bool PretranslateAllCorpora { get; set; } } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs index ae35ebbbb..3b26e2e16 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs @@ -24,7 +24,7 @@ Task PreprocessAsync( HashSet? ignoreUsfmMarkers = null ); - public string UpdateSourceUsfm( + string UpdateSourceUsfm( IReadOnlyList parallelCorpora, string corpusId, string bookId, @@ -37,7 +37,7 @@ public string UpdateSourceUsfm( string? targetQuoteConvention ); - public string UpdateTargetUsfm( + string UpdateTargetUsfm( IReadOnlyList parallelCorpora, string corpusId, string bookId, @@ -49,4 +49,10 @@ public string UpdateTargetUsfm( IEnumerable? remarks, string? targetQuoteConvention ); + + Dictionary> GetChapters( + IReadOnlyList parallelCorpora, + string fileLocation, + string scriptureRange + ); } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs index 3b8c4982a..a4e11124c 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs @@ -139,7 +139,17 @@ public async Task PreprocessAsync( HashSet? ignoreUsfmMarkers = null ) { - CorpusBundle corpusBundle = new(parallelCorpora); + await PreprocessAsync(new CorpusBundle(parallelCorpora), train, inference, useKeyTerms, ignoreUsfmMarkers); + } + + public async Task PreprocessAsync( + CorpusBundle corpusBundle, + Func train, + Func inference, + bool useKeyTerms = false, + HashSet? ignoreUsfmMarkers = null + ) + { ignoreUsfmMarkers ??= []; bool parallelTrainingDataPresent = false; @@ -486,8 +496,6 @@ private static TextRow CleanSegment(TextRow row) books.AddRange(corpus.TrainOnChapters.Keys); } - // if (isSource) - // { if (corpus.InferenceTextIds != null) { books.AddRange(corpus.InferenceTextIds); @@ -496,7 +504,7 @@ private static TextRow CleanSegment(TextRow row) { books.AddRange(corpus.InferenceChapters.Keys); } - // } + return [.. books.Select(bookName => Canon.BookIdToNumber(bookName))]; } @@ -504,7 +512,7 @@ public string UpdateSourceUsfm( IReadOnlyList parallelCorpora, string corpusId, string bookId, - IReadOnlyList rows, + IReadOnlyList rows, UpdateUsfmMarkerBehavior paragraphBehavior, UpdateUsfmMarkerBehavior embedBehavior, UpdateUsfmMarkerBehavior styleBehavior, @@ -599,7 +607,7 @@ bool isSource .Where(row => row.Refs.Any()) .OrderBy(row => row.Refs[0]) .ToArray(), - sourceSettings?.FullName, + isSource ? sourceSettings?.FullName : targetSettings?.FullName, textBehavior, paragraphBehavior, embedBehavior, @@ -775,4 +783,24 @@ public static string GetChapterRangesString(List chapterNumbers) } return string.Join(", ", chapterRangeStrings); } + + public Dictionary> GetChapters( + IReadOnlyList parallelCorpora, + string fileLocation, + string scriptureRange + ) + { + CorpusBundle corpusBundle = new(parallelCorpora); + try + { + return ScriptureRangeParser.GetChapters( + scriptureRange, + corpusBundle.GetSettings(fileLocation)?.Versification + ); + } + catch (ArgumentException ae) + { + throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); + } + } } diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs index 857963a93..2cbf44bc8 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs @@ -8,7 +8,7 @@ public class CorpusBundleTests public void GetSettings() { using TestEnvironment env = new(addParatext: true, addText: false); - string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; ParatextProjectSettings? settings = env.CorpusBundle.GetSettings(fileLocation); Assert.That(settings, Is.Not.Null); Assert.That(settings.Name, Is.EqualTo("Te1")); @@ -19,7 +19,7 @@ public void GetSettings() public void GetSettings_TextFile() { using TestEnvironment env = new(addParatext: false, addText: true); - string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; ParatextProjectSettings? settings = env.CorpusBundle.GetSettings(fileLocation); Assert.That(settings, Is.Null); Assert.That(env.CorpusBundle.ParentOf(fileLocation), Is.Null); @@ -29,7 +29,7 @@ public void GetSettings_TextFile() public void GetTextUpdater() { using TestEnvironment env = new(addParatext: true, addText: false); - string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; using ZipParatextProjectTextUpdater updater = env.CorpusBundle.GetTextUpdater(fileLocation); Assert.That( updater.UpdateUsfm("MAT", [], textBehavior: UpdateUsfmTextBehavior.PreferExisting).ReplaceLineEndings("\n"), @@ -63,7 +63,7 @@ public void GetTextUpdater() public void GetTextUpdater_TextFile() { using TestEnvironment env = new(addParatext: false, addText: true); - string fileLocation = env.CorpusBundle.ParallelCorpora.First().SourceCorpora[0].Files[0].Location; + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; Assert.Throws(() => env.CorpusBundle.GetTextUpdater(fileLocation)); } @@ -72,7 +72,7 @@ public void GetTextCorpora() { using TestEnvironment env = new(addParatext: true, addText: true); - Assert.That(env.CorpusBundle.ParallelCorpora.Count(), Is.EqualTo(3)); + Assert.That(env.CorpusBundle.ParallelCorpora, Has.Count.EqualTo(3)); Assert.That(env.CorpusBundle.SourceTermCorpora.Count(c => c.TextCorpora.Any()), Is.EqualTo(2)); Assert.That( diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs index 2903d0007..4032c8200 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs @@ -39,7 +39,7 @@ public void TestParallelCorpusAnalysis_FileFormatText() } [Test] - public async Task TestParallelCorpusPreprocessor_FileFormatText() + public async Task TestPreprocess_FileFormatText() { using var env = new TestEnvironment(); IReadOnlyList corpora = env.GetCorpora(paratextProject: false); @@ -73,7 +73,7 @@ await env.Processor.PreprocessAsync( } [Test] - public async Task TestParallelCorpusPreprocessor_FileFormatParatext() + public async Task TestPreprocess_FileFormatParatext() { using var env = new TestEnvironment(); IReadOnlyList corpora = env.GetCorpora(paratextProject: true); From 8c9a2d9ce703854167b595b4030bcf64b20b110f Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 18 Mar 2026 11:07:31 -0400 Subject: [PATCH 4/5] Passing tests for pretranslateAll, trainOnAll --- .../Services/CorpusMappingService.cs | 48 ++++++++++++------- .../Services/EngineService.cs | 24 +++------- .../Services/EngineServiceTests.cs | 34 +++++++++++++ .../Models/MonolingualCorpus.cs | 2 + .../Models/ParallelCorpus.cs | 2 - 5 files changed, 73 insertions(+), 37 deletions(-) diff --git a/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs index aa1fd168f..c27889749 100644 --- a/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs +++ b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs @@ -1,5 +1,3 @@ -using SIL.Extensions; - namespace Serval.Translation.Services; public class CorpusMappingService( @@ -56,12 +54,16 @@ Corpus source in corpora.Where(c => Id = source.Id, Language = source.SourceLanguage, Files = source.SourceFiles.Select(Map).ToArray(), + TrainOnAll = trainOnAllCorpora, + PretranslateAll = pretranslateAllCorpora, }; SIL.ServiceToolkit.Models.MonolingualCorpus targetCorpus = new() { Id = source.Id, Language = source.TargetLanguage, Files = source.TargetFiles.Select(Map).ToArray(), + TrainOnAll = trainOnAllCorpora, + PretranslateAll = pretranslateAllCorpora, }; if (trainingCorpus is not null) @@ -72,12 +74,10 @@ Corpus source in corpora.Where(c => $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for trainOn" ); } - if (trainingCorpus.TextIds is not null) - { - sourceCorpus.TrainOnTextIds.AddRange(trainingCorpus.TextIds); - targetCorpus.TrainOnTextIds.AddRange(trainingCorpus.TextIds); - } - if (!string.IsNullOrEmpty(trainingCorpus.ScriptureRange)) + sourceCorpus.TrainOnTextIds = trainingCorpus.TextIds?.ToHashSet(); + targetCorpus.TrainOnTextIds = trainingCorpus.TextIds?.ToHashSet(); + + if (trainingCorpus.ScriptureRange is not null) { if ( targetCorpus.Files.Count > 1 @@ -98,6 +98,8 @@ Corpus source in corpora.Where(c => sourceCorpus.TrainOnChapters = chapters; targetCorpus.TrainOnChapters = chapters; } + sourceCorpus.TrainOnAll = sourceCorpus.TrainOnChapters is null && sourceCorpus.TrainOnTextIds is null; + targetCorpus.TrainOnAll = targetCorpus.TrainOnChapters is null && targetCorpus.TrainOnTextIds is null; } if (pretranslateCorpus is not null) @@ -108,9 +110,8 @@ Corpus source in corpora.Where(c => $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for 'pretranslate'." ); } - if (pretranslateCorpus.TextIds is not null) - sourceCorpus.InferenceTextIds.AddRange(pretranslateCorpus.TextIds); - if (!string.IsNullOrEmpty(pretranslateCorpus.ScriptureRange)) + sourceCorpus.InferenceTextIds = pretranslateCorpus.TextIds?.ToHashSet(); + if (pretranslateCorpus.ScriptureRange is not null) { if ( targetCorpus.Files.Count > 1 @@ -129,14 +130,16 @@ Corpus source in corpora.Where(c => ) .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); } + sourceCorpus.PretranslateAll = + sourceCorpus.InferenceChapters is null && sourceCorpus.InferenceTextIds is null; + targetCorpus.PretranslateAll = + targetCorpus.InferenceChapters is null && targetCorpus.InferenceTextIds is null; } SIL.ServiceToolkit.Models.ParallelCorpus corpus = new() { Id = source.Id, SourceCorpora = [sourceCorpus], TargetCorpora = [targetCorpus], - TrainOnAllCorpora = trainOnAllCorpora, - PretranslateAllCorpora = pretranslateAllCorpora, }; mappedParallelCorpora.Add(corpus); } @@ -186,7 +189,11 @@ IReadOnlyList parallelCorpora sc, trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), - referenceFileLocation + referenceFileLocation, + trainOnAllCorpora + || (trainingCorpus is not null && trainingCorpus.SourceFilters is null), + pretranslateAllCorpora + || (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null) ) ) .ToArray(), @@ -197,12 +204,13 @@ IReadOnlyList parallelCorpora tc, trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(), null, - referenceFileLocation + referenceFileLocation, + trainOnAllCorpora + || (trainingCorpus is not null && trainingCorpus.TargetFilters is null), + pretranslateAllCorpora || pretranslateCorpus is not null ) ) .ToArray(), - TrainOnAllCorpora = trainOnAllCorpora, - PretranslateAllCorpora = pretranslateAllCorpora, } ); } @@ -214,7 +222,9 @@ private SIL.ServiceToolkit.Models.MonolingualCorpus Map( MonolingualCorpus inputCorpus, ParallelCorpusFilter? trainingFilter, ParallelCorpusFilter? pretranslateFilter, - string? referenceFileLocation + string? referenceFileLocation, + bool trainOnAll, + bool pretranslateAll ) { Dictionary>? trainOnChapters = null; @@ -254,6 +264,8 @@ pretranslateFilter is not null Id = inputCorpus.Id, Language = inputCorpus.Language, Files = inputCorpus.Files.Select(Map).ToArray(), + TrainOnAll = trainOnAll, + PretranslateAll = pretranslateAll, }; if ( diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 40fbe843d..e32d3bb5a 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -843,22 +843,12 @@ private static V1.ParallelCorpus Map(SIL.ServiceToolkit.Models.ParallelCorpus so return new V1.ParallelCorpus { Id = source.Id, - SourceCorpora = - { - source.SourceCorpora.Select(c => Map(c, source.TrainOnAllCorpora, source.PretranslateAllCorpora)), - }, - TargetCorpora = - { - source.TargetCorpora.Select(c => Map(c, source.TrainOnAllCorpora, source.PretranslateAllCorpora)), - }, + SourceCorpora = { source.SourceCorpora.Select(Map) }, + TargetCorpora = { source.TargetCorpora.Select(Map) }, }; } - private static V1.MonolingualCorpus Map( - SIL.ServiceToolkit.Models.MonolingualCorpus source, - bool trainOnAll, - bool pretranslateAll - ) + private static V1.MonolingualCorpus Map(SIL.ServiceToolkit.Models.MonolingualCorpus source) { var corpus = new V1.MonolingualCorpus { @@ -867,15 +857,15 @@ bool pretranslateAll Files = { source.Files.Select(Map) }, }; - if (trainOnAll || (source.TrainOnTextIds is null && source.TrainOnChapters is null)) + if (source.TrainOnAll) { corpus.TrainOnAll = true; } - if (source.TrainOnTextIds is not null) + else if (source.TrainOnTextIds is not null) { corpus.TrainOnTextIds.Add(source.TrainOnTextIds); } - if (source.TrainOnChapters is not null) + else if (source.TrainOnChapters is not null) { corpus.TrainOnChapters.Add( source @@ -889,7 +879,7 @@ bool pretranslateAll ); } - if (pretranslateAll || (source.InferenceTextIds is null && source.InferenceChapters is null)) + if (source.PretranslateAll) { corpus.PretranslateAll = true; } diff --git a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs index 5c6a17b1b..98dd9c46e 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs @@ -138,6 +138,7 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", Language = "es", Files = { @@ -159,6 +160,7 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", Language = "en", Files = { @@ -217,6 +219,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", TrainOnTextIds = { }, Files = @@ -239,6 +243,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", TrainOnTextIds = { }, Files = @@ -298,6 +304,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", TrainOnTextIds = { "text1" }, Files = @@ -320,6 +328,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", TrainOnTextIds = { "text1" }, Files = @@ -379,6 +389,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -400,6 +412,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -459,6 +473,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -480,6 +496,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -539,6 +557,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -560,6 +580,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -585,6 +607,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus2", + Language = "es", Files = { @@ -606,6 +630,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus2", + Language = "en", Files = { @@ -682,6 +708,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", TrainOnChapters = { @@ -714,6 +742,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", TrainOnChapters = { @@ -783,6 +813,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -804,6 +836,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs index 1ef59e6be..5b366a71b 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs @@ -9,6 +9,8 @@ public record MonolingualCorpus public Dictionary>? TrainOnChapters { get; set; } public HashSet? InferenceTextIds { get; set; } public Dictionary>? InferenceChapters { get; set; } + public bool TrainOnAll { get; set; } + public bool PretranslateAll { get; set; } public bool IsFiltered => TrainOnTextIds != null || TrainOnChapters != null || InferenceTextIds != null || InferenceChapters != null; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs index 8c47cfdf9..833741629 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs @@ -5,6 +5,4 @@ public record ParallelCorpus public required string Id { get; set; } public IReadOnlyList SourceCorpora { get; set; } = new List(); public IReadOnlyList TargetCorpora { get; set; } = new List(); - public bool TrainOnAllCorpora { get; set; } - public bool PretranslateAllCorpora { get; set; } } From 0037c67cdbb8f21d63603b106c46ff25c2ebd7bd Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 19 Mar 2026 13:33:50 -0400 Subject: [PATCH 5/5] Address reviewer comments --- .../Configuration/IServalBuilderExtensions.cs | 2 +- .../Services/CorpusMappingService.cs | 2 +- .../Services/ICorpusMappingService.cs | 1 - .../Services/PretranslationService.cs | 10 +++++----- .../Services/PretranslationServiceTests.cs | 20 +++++++++++++++++++ 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs index cc298534d..782699663 100644 --- a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs @@ -10,7 +10,7 @@ public static IServalBuilder AddTranslation(this IServalBuilder builder) builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); - builder.Services.AddSingleton(); + builder.Services.AddParallelCorpusService(); builder.Services.AddScoped(); builder.Services.AddScoped(); diff --git a/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs index c27889749..1667e5a30 100644 --- a/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs +++ b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs @@ -349,7 +349,7 @@ private SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source }; } - public string GetFilePath(string filename) + private string GetFilePath(string filename) { return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); } diff --git a/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs index 01ee59985..038b7c420 100644 --- a/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs +++ b/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs @@ -3,5 +3,4 @@ namespace Serval.Translation.Services; public interface ICorpusMappingService { IReadOnlyList Map(Build build, Engine engine); - string GetFilePath(string filename); } diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index b4d4d856f..c31ac96f9 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -50,8 +50,10 @@ public async Task GetUsfmAsync( ) { Engine? engine = await _engines.GetAsync(engineId, cancellationToken); - Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId); - ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); + if (engine is null) + throw new EntityNotFoundException($"Could not find the Engine '{engineId}'."); + Corpus? corpus = engine.Corpora.SingleOrDefault(c => c.Id == corpusId); + ParallelCorpus? parallelCorpus = engine.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); if (corpus is not null) { if (corpus.SourceFiles.Count == 0) @@ -107,9 +109,7 @@ public async Task GetUsfmAsync( List remarks = [disclaimerRemark, markerPlacementRemark]; - SIL.ServiceToolkit.Models.ParallelCorpus[] parallelCorpora = _corpusMappingService - .Map(build, engine!) - .ToArray(); + SIL.ServiceToolkit.Models.ParallelCorpus[] parallelCorpora = _corpusMappingService.Map(build, engine).ToArray(); IEnumerable pretranslations = ( await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index 78b564d0a..941714a53 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -416,6 +416,26 @@ public void GetUsfmAsync_BadPretranslationVerseRef() }); } + [Test] + public void GetUsfmAsync_EngineDoesNotExist() + { + using TestEnvironment env = new(); + Assert.ThrowsAsync(async () => + await env.Service.GetUsfmAsync( + engineId: "engine2", + modelRevision: 1, + corpusId: "corpus1", + textId: "MAT", + textOrigin: PretranslationUsfmTextOrigin.PreferPretranslated, + template: PretranslationUsfmTemplate.Auto, + paragraphMarkerBehavior: PretranslationUsfmMarkerBehavior.PreservePosition, + embedBehavior: PretranslationUsfmMarkerBehavior.Preserve, + styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip, + quoteNormalizationBehavior: PretranslationNormalizationBehavior.Denormalized + ) + ); + } + [Test] [TestCase(new int[] { 1, 2, 3 }, "1-3")] [TestCase(new int[] { 1, 3, 4 }, "1, 3-4")]