diff --git a/src/Echo/src/EchoEngine/Program.cs b/src/Echo/src/EchoEngine/Program.cs index 58a1c8da3..796260736 100644 --- a/src/Echo/src/EchoEngine/Program.cs +++ b/src/Echo/src/EchoEngine/Program.cs @@ -24,7 +24,7 @@ builder.Services.AddHostedService(); builder.Services.AddSingleton(); -builder.Services.AddParallelCorpusPreprocessor(); +builder.Services.AddParallelCorpusService(); builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy()); diff --git a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs index 60541a976..dd84b9521 100644 --- a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs @@ -4,7 +4,7 @@ namespace EchoEngine; public class TranslationEngineServiceV1( BackgroundTaskQueue taskQueue, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, TranslationPlatformApi.TranslationPlatformApiClient platformApiClient ) : TranslationEngineApi.TranslationEngineApiBase { @@ -12,8 +12,7 @@ TranslationPlatformApi.TranslationPlatformApiClient platformApiClient private readonly BackgroundTaskQueue _taskQueue = taskQueue; private readonly TranslationPlatformApi.TranslationPlatformApiClient _platformApiClient = platformApiClient; - private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = - parallelCorpusPreprocessingService; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; public override Task Create(CreateRequest request, ServerCallContext context) { @@ -125,22 +124,22 @@ await client.BuildStartedAsync( int pretranslateCount = 0; List pretranslationsRequests = []; - await _parallelCorpusPreprocessingService.PreprocessAsync( - request.Corpora.Select(Map).ToList(), + await _parallelCorpusService.PreprocessAsync( + request.Corpora.Select(Map), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; return Task.CompletedTask; }, - (row, isInTrainingData, corpus) => + (row, isInTrainingData, corpusId) => { string[] tokens = row.SourceSegment.Split(); pretranslationsRequests.Add( new InsertPretranslationsRequest { EngineId = request.EngineId, - CorpusId = corpus.Id, + CorpusId = corpusId, TextId = row.TextId, SourceRefs = { row.SourceRefs.Select(r => r.ToString()) }, TargetRefs = { row.TargetRefs.Select(r => r.ToString()) }, diff --git a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs index 98ba34c93..74ad1f07d 100644 --- a/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs @@ -2,15 +2,12 @@ namespace EchoEngine; -public class WordAlignmentEngineServiceV1( - BackgroundTaskQueue taskQueue, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService -) : WordAlignmentEngineApi.WordAlignmentEngineApiBase +public class WordAlignmentEngineServiceV1(BackgroundTaskQueue taskQueue, IParallelCorpusService parallelCorpusService) + : WordAlignmentEngineApi.WordAlignmentEngineApiBase { private static readonly Empty Empty = new(); private readonly BackgroundTaskQueue _taskQueue = taskQueue; - private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService = - parallelCorpusPreprocessingService; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; public override Task Create(CreateRequest request, ServerCallContext context) { @@ -79,21 +76,21 @@ await client.BuildStartedAsync( int trainCount = 0; int wordAlignCount = 0; List wordAlignmentsRequests = []; - await _parallelCorpusPreprocessingService.PreprocessAsync( - request.Corpora.Select(Map).ToList(), + await _parallelCorpusService.PreprocessAsync( + request.Corpora.Select(Map), (row, _) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; return Task.CompletedTask; }, - (row, isInTrainingData, corpus) => + (row, isInTrainingData, corpusId) => { wordAlignmentsRequests.Add( new InsertWordAlignmentsRequest { EngineId = request.EngineId, - CorpusId = corpus.Id, + CorpusId = corpusId, TextId = row.TextId, SourceRefs = { row.SourceRefs.Select(r => r.ToString()) }, TargetRefs = { row.TargetRefs.Select(r => r.ToString()) }, diff --git a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs index 352768e4e..f5d82f55a 100644 --- a/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Machine/src/Serval.Machine.Shared/Configuration/IServiceCollectionExtensions.cs @@ -19,7 +19,7 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf (sp, cancellationToken) => sp.GetRequiredService().InitAsync(cancellationToken) ); - services.AddParallelCorpusPreprocessor(); + services.AddParallelCorpusService(); services.Configure(configuration.GetSection("Bugsnag")); services.AddBugsnag(); services.AddDiagnostics(); diff --git a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs index f64da3c73..2b5f06393 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/NmtPreprocessBuildJob.cs @@ -8,7 +8,7 @@ public class NmtPreprocessBuildJob( IBuildJobService buildJobService, ISharedFileService sharedFileService, ILanguageTagService languageTagService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : TranslationPreprocessBuildJob( @@ -18,7 +18,7 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { @@ -33,21 +33,12 @@ private bool ResolveLanguageCode(string languageCode, out string resolvedCode) protected override async Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { - List quoteConventionAnalyses = []; - foreach (ParallelCorpus parallelCorpus in corpora) - { - QuoteConventionAnalysis? targetQuotationConventionAnalysis = - ParallelCorpusPreprocessingService.AnalyzeTargetCorpusQuoteConvention(parallelCorpus); - if (targetQuotationConventionAnalysis != null) - quoteConventionAnalyses.Add(targetQuotationConventionAnalysis); - } - string overallTargetQuoteConventionAnalysis = - QuoteConventionAnalysis.CombineWithWeightedAverage(quoteConventionAnalyses)?.BestQuoteConvention?.Name + ParallelCorpusService.AnalyzeTargetQuoteConvention(parallelCorpora)?.BestQuoteConvention?.Name ?? string.Empty; await PlatformService.UpdateTargetQuoteConventionAsync( @@ -65,7 +56,7 @@ protected override async Task UpdateBuildExecutionData( int pretranslateCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { @@ -84,7 +75,7 @@ CancellationToken cancellationToken pretranslateCount, sourceLanguageTag, targetLanguageTag, - corpora + parallelCorpora ); int maxWarnings = BuildJobOptions.MaxWarnings; @@ -128,12 +119,12 @@ protected override IReadOnlyList GetWarnings( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora + IReadOnlyList parallelCorpora ) { List warnings = [ - .. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, corpora), + .. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, parallelCorpora), ]; // Has at least a Gospel of Mark amount of data and not the special case of no data which will be caught elsewhere diff --git a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs index 630cf5d2e..d3488e8d3 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/PreprocessBuildJob.cs @@ -7,7 +7,7 @@ public abstract class PreprocessBuildJob( ILogger> logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : HangfireBuildJob>( @@ -31,8 +31,7 @@ IOptionsMonitor options internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML; protected readonly BuildJobOptions BuildJobOptions = options.CurrentValue; protected readonly ISharedFileService SharedFileService = sharedFileService; - protected readonly IParallelCorpusPreprocessingService ParallelCorpusPreprocessingService = - parallelCorpusPreprocessingService; + protected readonly IParallelCorpusService ParallelCorpusService = parallelCorpusService; protected override async Task DoWorkAsync( string engineId, @@ -95,20 +94,20 @@ protected abstract Task UpdateBuildExecutionData( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ); protected virtual Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) => Task.CompletedTask; protected abstract Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, string? buildOptions, CancellationToken cancellationToken ); @@ -116,7 +115,7 @@ CancellationToken cancellationToken protected override async Task CleanupAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList parallelCorpora, JobCompletionStatus completionStatus ) { @@ -138,34 +137,48 @@ protected virtual IReadOnlyList GetWarnings( int inferenceCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora + IReadOnlyList parallelCorpora ) { List warnings = []; - foreach (ParallelCorpus parallelCorpus in corpora) + foreach ( + ( + string parallelCorpusId, + string monolingualCorpusId, + IReadOnlyList errors + ) in ParallelCorpusService.AnalyzeUsfmVersification(parallelCorpora) + ) { - IReadOnlyList<(string MonolingualCorpusId, IReadOnlyList errors)> errorsPerCorpus = - ParallelCorpusPreprocessingService.AnalyzeUsfmVersification(parallelCorpus); - - foreach ((string monolingualCorpusId, IReadOnlyList errors) in errorsPerCorpus) + foreach (UsfmVersificationError error in errors) { - foreach (UsfmVersificationError error in errors) - { - warnings.Add( - error.Type switch - { - UsfmVersificationErrorType.InvalidChapterNumber => - $"Invalid chapter number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})", - UsfmVersificationErrorType.InvalidVerseNumber => - $"Invalid verse number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})", - _ => - $"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})", - } - ); - } + warnings.Add( + error.Type switch + { + UsfmVersificationErrorType.InvalidChapterNumber => + $"Invalid chapter number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})", + UsfmVersificationErrorType.InvalidVerseNumber => + $"Invalid verse number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})", + _ => + $"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})", + } + ); } } + + foreach ( + ( + string parallelCorpusId, + string monolingualCorpusId, + MissingParentProjectError error + ) in ParallelCorpusService.FindMissingParentProjects(parallelCorpora) + ) + { + warnings.Add( + $"Unable to locate parent project {error.ParentProjectName} of daughter project {error.ProjectName} (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})" + ); + } + return warnings; } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs index adbf66e85..cdd618312 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/SmtTransferPreprocessBuildJob.cs @@ -9,7 +9,7 @@ public class SmtTransferPreprocessBuildJob( ISharedFileService sharedFileService, IDistributedReaderWriterLockFactory lockFactory, IRepository trainSegmentPairs, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : TranslationPreprocessBuildJob( @@ -19,7 +19,7 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { @@ -29,7 +29,7 @@ IOptionsMonitor options protected override async Task InitializeAsync( string engineId, string buildId, - IReadOnlyList data, + IReadOnlyList corpora, CancellationToken cancellationToken ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs index 8c1aacf33..9715afb92 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/TranslationPreprocessBuildJob.cs @@ -7,7 +7,7 @@ public class TranslationPreprocessBuildJob( ILogger> logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : PreprocessBuildJob( @@ -17,13 +17,13 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, string? buildOptions, CancellationToken cancellationToken ) @@ -55,13 +55,13 @@ await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.tx int trainCount = 0; int pretranslateCount = 0; pretranslateWriter.WriteStartArray(); - await ParallelCorpusPreprocessingService.PreprocessAsync( - corpora, + await ParallelCorpusService.PreprocessAsync( + parallelCorpora, async (row, trainingDataType) => { if (row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0) { - if (trainingDataType == TrainingDataType.KeyTerms) + if (trainingDataType == TrainingDataType.KeyTerm) { await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n"); await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n"); @@ -75,12 +75,12 @@ await ParallelCorpusPreprocessingService.PreprocessAsync( if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) trainCount++; }, - async (row, isInTrainingData, corpus) => + async (row, isInTrainingData, corpusId) => { if (row.SourceSegment.Length > 0 && !isInTrainingData) { pretranslateWriter.WriteStartObject(); - pretranslateWriter.WriteString("corpusId", corpus.Id); + pretranslateWriter.WriteString("corpusId", corpusId); pretranslateWriter.WriteString("textId", row.TextId); pretranslateWriter.WriteStartArray("refs"); foreach (object rowRef in row.TargetRefs) @@ -109,7 +109,7 @@ protected override async Task UpdateBuildExecutionData( int pretranslateCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { @@ -118,7 +118,7 @@ CancellationToken cancellationToken pretranslateCount, sourceLanguageTag, targetLanguageTag, - corpora + parallelCorpora ); // Log summary of build data diff --git a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs index 89f265e7f..8459232aa 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/WordAlignmentPreprocessBuildJob.cs @@ -7,7 +7,7 @@ public class WordAlignmentPreprocessBuildJob( ILogger logger, IBuildJobService buildJobService, ISharedFileService sharedFileService, - IParallelCorpusPreprocessingService parallelCorpusPreprocessingService, + IParallelCorpusService parallelCorpusService, IOptionsMonitor options ) : PreprocessBuildJob( @@ -17,13 +17,13 @@ IOptionsMonitor options logger, buildJobService, sharedFileService, - parallelCorpusPreprocessingService, + parallelCorpusService, options ) { protected override async Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync( string buildId, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, string? buildOptions, CancellationToken cancellationToken ) @@ -55,13 +55,13 @@ await SharedFileService.OpenWriteAsync($"builds/{buildId}/train.key-terms.trg.tx int trainCount = 0; int inferenceCount = 0; wordAlignmentWriter.WriteStartArray(); - await ParallelCorpusPreprocessingService.PreprocessAsync( - corpora, + await ParallelCorpusService.PreprocessAsync( + parallelCorpora, async (row, trainingDataType) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) { - if (trainingDataType == TrainingDataType.KeyTerms) + if (trainingDataType == TrainingDataType.KeyTerm) { await sourceKeyTermsTrainWriter.WriteAsync($"{row.SourceSegment}\n"); await targetKeyTermsTrainWriter.WriteAsync($"{row.TargetSegment}\n"); @@ -75,12 +75,12 @@ await ParallelCorpusPreprocessingService.PreprocessAsync( trainCount++; } }, - async (row, isInTrainingData, corpus) => + async (row, isInTrainingData, corpusId) => { if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0 && !isInTrainingData) { wordAlignmentWriter.WriteStartObject(); - wordAlignmentWriter.WriteString("corpusId", corpus.Id); + wordAlignmentWriter.WriteString("corpusId", corpusId); wordAlignmentWriter.WriteString("textId", row.TextId); wordAlignmentWriter.WriteStartArray("refs"); foreach (object rowRef in row.TargetRefs) @@ -109,7 +109,7 @@ protected override async Task UpdateBuildExecutionData( int wordAlignCount, string sourceLanguageTag, string targetLanguageTag, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { @@ -118,7 +118,7 @@ CancellationToken cancellationToken wordAlignCount, sourceLanguageTag, targetLanguageTag, - corpora + parallelCorpora ); // Log summary of build data @@ -148,7 +148,7 @@ CancellationToken cancellationToken protected override Task UpdateTargetQuoteConventionAsync( string engineId, string buildId, - IReadOnlyList corpora, + IReadOnlyList parallelCorpora, CancellationToken cancellationToken ) { diff --git a/src/Machine/src/Serval.Machine.Shared/Usings.cs b/src/Machine/src/Serval.Machine.Shared/Usings.cs index 5921669aa..7e6db4fa3 100644 --- a/src/Machine/src/Serval.Machine.Shared/Usings.cs +++ b/src/Machine/src/Serval.Machine.Shared/Usings.cs @@ -56,7 +56,6 @@ global using SIL.DataAccess; global using SIL.Machine.Corpora; global using SIL.Machine.Morphology.HermitCrab; -global using SIL.Machine.PunctuationAnalysis; global using SIL.Machine.Tokenization; global using SIL.Machine.Translation; global using SIL.Machine.Translation.Thot; diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs index 29d307d7b..3b2bd5574 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/NmtEngineServiceTests.cs @@ -330,7 +330,7 @@ public override object ActivateJob(Type jobType) _env.BuildJobService, _env.SharedFileService, new LanguageTagService(), - new ParallelCorpusPreprocessingService(new TextCorpusService()), + new ParallelCorpusService(), _env.BuildJobOptions ); } diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs index 1aa6e716d..c62ecaae4 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/PreprocessBuildJobTests.cs @@ -343,10 +343,34 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() using TestEnvironment env = new(); env.PersistModel(); // MRK does not contain verse data, so there is no inferencing ParallelCorpus corpus = env.ParatextCorpus(trainOnTextIds: ["LEV"], inferenceTextIds: ["MRK"]); + var parallelCorpusService = new ParallelCorpusService(); + env.ParallelCorpusService = Substitute.For(); + env.ParallelCorpusService.When(s => + s.PreprocessAsync( + Arg.Any>(), + Arg.Any>(), + Arg.Any>(), + Arg.Any(), + Arg.Any?>() + ) + ) + .Do(async callInfo => + { + CorpusBundle corpusBundle = new(callInfo.ArgAt>(0)); + DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( + corpusBundle, + ["LEV", "MRK", "MAT"], + ["MAT"] + ); - env.TextCorpusService = Substitute.For(); - env.TextCorpusService.CreateTextCorpora(Arg.Any>()) - .Returns([new DummyCorpus(["LEV", "MRK", "MAT"], ["MAT"])]); + await parallelCorpusService.PreprocessAsync( + dummyCorpusBundle, + callInfo.ArgAt>(1), + callInfo.ArgAt>(2), + callInfo.ArgAt(3), + callInfo.ArgAt?>(4) + ); + }); Assert.DoesNotThrowAsync(async () => { await env.RunBuildJobAsync(corpus); @@ -354,17 +378,47 @@ public void RunAsync_OnlyParseSelectedBooks_NoBadBooks() } [Test] - public void RunAsync_OnlyParseSelectedBooks_TrainOnBadBook() + public async Task RunAsync_OnlyParseSelectedBooks_TrainOnBadBook() { using TestEnvironment env = new(); ParallelCorpus corpus = env.ParatextCorpus(trainOnTextIds: ["MAT"], inferenceTextIds: ["MRK"]); - env.TextCorpusService = Substitute.For(); - env.TextCorpusService.CreateTextCorpora(Arg.Any>()) - .Returns([new DummyCorpus(["LEV", "MRK", "MAT"], ["MAT"])]); - Assert.ThrowsAsync(async () => + var parallelCorpusService = new ParallelCorpusService(); + env.ParallelCorpusService = Substitute.For(); + ArgumentException? ex = null; + env.ParallelCorpusService.When(s => + s.PreprocessAsync( + Arg.Any>(), + Arg.Any>(), + Arg.Any>(), + Arg.Any(), + Arg.Any?>() + ) + ) + .Do(async callInfo => + { + CorpusBundle corpusBundle = new(callInfo.ArgAt>(0)); + DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( + corpusBundle, + ["LEV", "MRK", "MAT"], + ["MAT"] + ); + ex = Assert.ThrowsAsync(async () => + { + await parallelCorpusService.PreprocessAsync( + dummyCorpusBundle, + callInfo.ArgAt>(1), + callInfo.ArgAt>(2), + callInfo.ArgAt(3), + callInfo.ArgAt?>(4) + ); + }); + }); + Assert.ThrowsAsync(async () => { await env.RunBuildJobAsync(corpus); }); + + Assert.That(ex, Is.Not.Null); } [Test] @@ -372,13 +426,43 @@ public void RunAsync_OnlyParseSelectedBooks_PretranslateOnBadBook() { using TestEnvironment env = new(); ParallelCorpus corpus = env.ParatextCorpus(trainOnTextIds: ["LEV"], inferenceTextIds: ["MAT"]); - env.TextCorpusService = Substitute.For(); - env.TextCorpusService.CreateTextCorpora(Arg.Any>()) - .Returns([new DummyCorpus(["LEV", "MRK", "MAT"], ["MAT"])]); - Assert.ThrowsAsync(async () => + var parallelCorpusService = new ParallelCorpusService(); + env.ParallelCorpusService = Substitute.For(); + ArgumentException? ex = null; + env.ParallelCorpusService.When(s => + s.PreprocessAsync( + Arg.Any>(), + Arg.Any>(), + Arg.Any>(), + Arg.Any(), + Arg.Any?>() + ) + ) + .Do(async callInfo => + { + CorpusBundle corpusBundle = new(callInfo.ArgAt>(0)); + DummyCorpusBundle dummyCorpusBundle = new DummyCorpusBundle( + corpusBundle, + ["LEV", "MRK", "MAT"], + ["MAT"] + ); + ex = Assert.ThrowsAsync(async () => + { + await parallelCorpusService.PreprocessAsync( + dummyCorpusBundle, + callInfo.ArgAt>(1), + callInfo.ArgAt>(2), + callInfo.ArgAt(3), + callInfo.ArgAt?>(4) + ); + }); + }); + Assert.ThrowsAsync(async () => { await env.RunBuildJobAsync(corpus); }); + + Assert.That(ex, Is.Not.Null); } [Test] @@ -501,12 +585,12 @@ private class TestEnvironment : DisposableBase private readonly TempDirectory _tempDir; public ISharedFileService SharedFileService { get; } - public ITextCorpusService TextCorpusService { get; set; } public IPlatformService PlatformService { get; } public MemoryRepository Engines { get; } public MemoryRepository TrainSegmentPairs { get; } public IDistributedReaderWriterLockFactory LockFactory { get; } public IBuildJobService BuildJobService { get; } + public IParallelCorpusService ParallelCorpusService { get; set; } public IClearMLService ClearMLService { get; } public IOptionsMonitor BuildJobOptions { get; } @@ -709,7 +793,6 @@ public TestEnvironment() } ); TrainSegmentPairs = new MemoryRepository(); - TextCorpusService = new TextCorpusService(); PlatformService = Substitute.For(); PlatformService.EngineGroup.Returns(EngineGroup.Translation); PlatformService.UpdateBuildExecutionDataAsync( @@ -788,6 +871,7 @@ public TestEnvironment() ], Engines ); + ParallelCorpusService = new ParallelCorpusService(); } public PreprocessBuildJob GetBuildJob(EngineType engineType) @@ -804,7 +888,7 @@ public PreprocessBuildJob GetBuildJob(EngineType engineType) BuildJobService, SharedFileService, new LanguageTagService(), - new ParallelCorpusPreprocessingService(TextCorpusService), + ParallelCorpusService, BuildJobOptions ); } @@ -819,7 +903,7 @@ public PreprocessBuildJob GetBuildJob(EngineType engineType) SharedFileService, LockFactory, TrainSegmentPairs, - new ParallelCorpusPreprocessingService(TextCorpusService), + ParallelCorpusService, BuildJobOptions ); } @@ -1130,4 +1214,16 @@ IEnumerator IEnumerable.GetEnumerator() return Texts.GetEnumerator(); } } + + private class DummyCorpusBundle(CorpusBundle corpusBundle, IEnumerable books, IEnumerable failsOn) + : CorpusBundle(corpusBundle.ParallelCorpora) + { + private IEnumerable FailsOn { get; } = failsOn; + private IEnumerable Books { get; } = books; + + protected override IReadOnlyList CreateTextCorpora(IReadOnlyList files) + { + return [new DummyCorpus(Books, FailsOn)]; + } + } } diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs index 9162fcf37..a852b846f 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/SmtTransferEngineServiceTests.cs @@ -711,7 +711,7 @@ public override object ActivateJob(Type jobType) _env.SharedFileService, _env._lockFactory, _env.TrainSegmentPairs, - new ParallelCorpusPreprocessingService(new TextCorpusService()), + new ParallelCorpusService(), _env.BuildJobOptions ) { diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs index 63711d67f..4722b4b88 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Services/StatisticalEngineServiceTests.cs @@ -457,7 +457,7 @@ public override object ActivateJob(Type jobType) Substitute.For>(), _env.BuildJobService, _env.SharedFileService, - new ParallelCorpusPreprocessingService(new TextCorpusService()), + new ParallelCorpusService(), _env.BuildJobOptions ) { diff --git a/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs b/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs index 3e3459b7a..a6f64f22a 100644 --- a/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs +++ b/src/Serval/src/Serval.Shared/Configuration/IServiceCollectionExtensions.cs @@ -5,7 +5,6 @@ public static class IServiceCollectionExtensions public static IServalBuilder AddServal(this IServiceCollection services, IConfiguration configuration) { services.AddFileSystem(); - services.AddTransient(); return new ServalBuilder(services, configuration); } } diff --git a/src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs b/src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs deleted file mode 100644 index 08424a55d..000000000 --- a/src/Serval/src/Serval.Shared/Services/IScriptureDataFileService.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace Serval.Shared.Services; - -public interface IScriptureDataFileService -{ - ParatextProjectSettings GetParatextProjectSettings(string filename); - ZipParatextProjectTextUpdater GetZipParatextProjectTextUpdater(string filename); -} diff --git a/src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs b/src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs deleted file mode 100644 index e38b8c08c..000000000 --- a/src/Serval/src/Serval.Shared/Services/ScriptureDataFileService.cs +++ /dev/null @@ -1,31 +0,0 @@ -namespace Serval.Shared.Services; - -public class ScriptureDataFileService(IFileSystem fileSystem, IOptionsMonitor dataFileOptions) - : IScriptureDataFileService -{ - private readonly IFileSystem _fileSystem = fileSystem; - private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; - - public ParatextProjectSettings GetParatextProjectSettings(string filename) - { - using IZipContainer container = _fileSystem.OpenZipFile(GetFilePath(filename)); - return ParseProjectSettings(container); - } - - public ZipParatextProjectTextUpdater GetZipParatextProjectTextUpdater(string filename) - { - IZipContainer container = _fileSystem.OpenZipFile(GetFilePath(filename)); - return new ZipParatextProjectTextUpdater(container); - } - - private string GetFilePath(string filename) - { - return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); - } - - private static ParatextProjectSettings ParseProjectSettings(IZipContainer container) - { - ZipParatextProjectSettingsParser settingsParser = new(container); - return settingsParser.Parse(); - } -} diff --git a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs b/src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs deleted file mode 100644 index 907717e9e..000000000 --- a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectSettingsParser.cs +++ /dev/null @@ -1,4 +0,0 @@ -namespace Serval.Shared.Services; - -public class ZipParatextProjectSettingsParser(IZipContainer projectContainer) - : ParatextProjectSettingsParserBase(new ZipParatextProjectFileHandler(projectContainer)) { } diff --git a/src/Serval/src/Serval.Shared/Usings.cs b/src/Serval/src/Serval.Shared/Usings.cs index c4bf91004..a55e44177 100644 --- a/src/Serval/src/Serval.Shared/Usings.cs +++ b/src/Serval/src/Serval.Shared/Usings.cs @@ -10,13 +10,8 @@ global using Microsoft.Extensions.Configuration; global using Microsoft.Extensions.Diagnostics.HealthChecks; global using Microsoft.Extensions.Logging; -global using Microsoft.Extensions.Options; -global using MongoDB.Driver; global using Serval.Shared.Configuration; global using Serval.Shared.Contracts; global using Serval.Shared.Models; -global using Serval.Shared.Services; global using Serval.Shared.Utils; global using SIL.DataAccess; -global using SIL.Machine.Corpora; -global using SIL.ServiceToolkit.Services; diff --git a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs index 46cf84114..782699663 100644 --- a/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs +++ b/src/Serval/src/Serval.Translation/Configuration/IServalBuilderExtensions.cs @@ -10,7 +10,10 @@ public static IServalBuilder AddTranslation(this IServalBuilder builder) builder.AddApiOptions(builder.Configuration.GetSection(ApiOptions.Key)); builder.AddDataFileOptions(builder.Configuration.GetSection(DataFileOptions.Key)); + builder.Services.AddParallelCorpusService(); + builder.Services.AddScoped(); + builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); diff --git a/src/Serval/src/Serval.Translation/Models/Pretranslation.cs b/src/Serval/src/Serval.Translation/Models/Pretranslation.cs index 854203203..74f76ad2f 100644 --- a/src/Serval/src/Serval.Translation/Models/Pretranslation.cs +++ b/src/Serval/src/Serval.Translation/Models/Pretranslation.cs @@ -12,7 +12,7 @@ public class Pretranslation : IEntity public required IReadOnlyList? TargetRefs { get; init; } = []; public required IReadOnlyList Refs { get; init; } public required string Translation { get; init; } - public IEnumerable? SourceTokens { get; init; } - public IEnumerable? TranslationTokens { get; init; } + public IReadOnlyList? SourceTokens { get; init; } + public IReadOnlyList? TranslationTokens { get; init; } public IReadOnlyList? Alignment { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs new file mode 100644 index 000000000..1667e5a30 --- /dev/null +++ b/src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs @@ -0,0 +1,356 @@ +namespace Serval.Translation.Services; + +public class CorpusMappingService( + IOptionsMonitor dataFileOptions, + IParallelCorpusService parallelCorpusService +) : ICorpusMappingService +{ + private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; + + public IReadOnlyList Map(Build build, Engine engine) + { + if (engine.ParallelCorpora.Any()) + { + return Map(build, engine.ParallelCorpora); + } + else + { + return Map(build, engine, engine.Corpora); + } + } + + public IReadOnlyList Map( + Build build, + Engine engine, + IReadOnlyList corpora + ) + { + List mappedParallelCorpora = []; + + Dictionary? trainingCorpora = build.TrainOn?.ToDictionary(c => c.CorpusRef!); + Dictionary? pretranslateCorpora = build.Pretranslate?.ToDictionary(c => + c.CorpusRef! + ); + bool trainOnAllCorpora = trainingCorpora is null; + bool pretranslateAllCorpora = pretranslateCorpora is null; + + foreach ( + Corpus source in corpora.Where(c => + trainingCorpora == null + || trainingCorpora.ContainsKey(c.Id) + || pretranslateCorpora == null + || pretranslateCorpora.ContainsKey(c.Id) + ) + ) + { + TrainingCorpus? trainingCorpus = trainingCorpora?.GetValueOrDefault(source.Id); + PretranslateCorpus? pretranslateCorpus = pretranslateCorpora?.GetValueOrDefault(source.Id); + + IEnumerable sourceFiles = source.SourceFiles.Select(Map); + IEnumerable targetFiles = source.TargetFiles.Select(Map); + SIL.ServiceToolkit.Models.MonolingualCorpus sourceCorpus = new() + { + Id = source.Id, + Language = source.SourceLanguage, + Files = source.SourceFiles.Select(Map).ToArray(), + TrainOnAll = trainOnAllCorpora, + PretranslateAll = pretranslateAllCorpora, + }; + SIL.ServiceToolkit.Models.MonolingualCorpus targetCorpus = new() + { + Id = source.Id, + Language = source.TargetLanguage, + Files = source.TargetFiles.Select(Map).ToArray(), + TrainOnAll = trainOnAllCorpora, + PretranslateAll = pretranslateAllCorpora, + }; + + if (trainingCorpus is not null) + { + if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null) + { + throw new InvalidOperationException( + $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for trainOn" + ); + } + sourceCorpus.TrainOnTextIds = trainingCorpus.TextIds?.ToHashSet(); + targetCorpus.TrainOnTextIds = trainingCorpus.TextIds?.ToHashSet(); + + if (trainingCorpus.ScriptureRange is not null) + { + if ( + targetCorpus.Files.Count > 1 + || targetCorpus.Files[0].Format != SIL.ServiceToolkit.Models.FileFormat.Paratext + ) + { + throw new InvalidOperationException( + $"The corpus {source.Id} is not compatible with using a scripture range" + ); + } + var chapters = _parallelCorpusService + .GetChapters( + corpora.Select(c => Map(c, engine)).ToArray(), + GetFilePath(targetCorpus.Files[0].Location), + trainingCorpus.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + sourceCorpus.TrainOnChapters = chapters; + targetCorpus.TrainOnChapters = chapters; + } + sourceCorpus.TrainOnAll = sourceCorpus.TrainOnChapters is null && sourceCorpus.TrainOnTextIds is null; + targetCorpus.TrainOnAll = targetCorpus.TrainOnChapters is null && targetCorpus.TrainOnTextIds is null; + } + + if (pretranslateCorpus is not null) + { + if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null) + { + throw new InvalidOperationException( + $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for 'pretranslate'." + ); + } + sourceCorpus.InferenceTextIds = pretranslateCorpus.TextIds?.ToHashSet(); + if (pretranslateCorpus.ScriptureRange is not null) + { + if ( + targetCorpus.Files.Count > 1 + || targetCorpus.Files[0].Format != SIL.ServiceToolkit.Models.FileFormat.Paratext + ) + { + throw new InvalidOperationException( + $"The corpus {source.Id} is not compatible with using a scripture range" + ); + } + sourceCorpus.InferenceChapters = _parallelCorpusService + .GetChapters( + corpora.Select(c => Map(c, engine)).ToArray(), + GetFilePath(targetCorpus.Files[0].Location), + pretranslateCorpus.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + } + sourceCorpus.PretranslateAll = + sourceCorpus.InferenceChapters is null && sourceCorpus.InferenceTextIds is null; + targetCorpus.PretranslateAll = + targetCorpus.InferenceChapters is null && targetCorpus.InferenceTextIds is null; + } + SIL.ServiceToolkit.Models.ParallelCorpus corpus = new() + { + Id = source.Id, + SourceCorpora = [sourceCorpus], + TargetCorpora = [targetCorpus], + }; + mappedParallelCorpora.Add(corpus); + } + return mappedParallelCorpora; + } + + private IReadOnlyList Map( + Build build, + IReadOnlyList parallelCorpora + ) + { + List mappedParallelCorpora = []; + Dictionary? trainingCorpora = build.TrainOn?.ToDictionary(c => c.ParallelCorpusRef!); + Dictionary? pretranslateCorpora = build.Pretranslate?.ToDictionary(c => + c.ParallelCorpusRef! + ); + + bool trainOnAllCorpora = trainingCorpora is null; + bool pretranslateAllCorpora = pretranslateCorpora is null; + + parallelCorpora = parallelCorpora + .Where(pc => + trainingCorpora == null + || trainingCorpora.ContainsKey(pc.Id) + || pretranslateCorpora == null + || pretranslateCorpora.ContainsKey(pc.Id) + ) + .ToArray(); + foreach (ParallelCorpus source in parallelCorpora) + { + TrainingCorpus? trainingCorpus = trainingCorpora?.GetValueOrDefault(source.Id); + PretranslateCorpus? pretranslateCorpus = pretranslateCorpora?.GetValueOrDefault(source.Id); + + string? referenceFileLocation = + source.TargetCorpora.Count > 0 && source.TargetCorpora[0].Files.Count > 0 + ? Map(source.TargetCorpora[0].Files[0]).Location + : null; + + mappedParallelCorpora.Add( + new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source + .SourceCorpora.Select(sc => + Map( + parallelCorpora, + sc, + trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), + pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), + referenceFileLocation, + trainOnAllCorpora + || (trainingCorpus is not null && trainingCorpus.SourceFilters is null), + pretranslateAllCorpora + || (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null) + ) + ) + .ToArray(), + TargetCorpora = source + .TargetCorpora.Select(tc => + Map( + parallelCorpora, + tc, + trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(), + null, + referenceFileLocation, + trainOnAllCorpora + || (trainingCorpus is not null && trainingCorpus.TargetFilters is null), + pretranslateAllCorpora || pretranslateCorpus is not null + ) + ) + .ToArray(), + } + ); + } + return mappedParallelCorpora; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map( + IReadOnlyList parallelCorpora, + MonolingualCorpus inputCorpus, + ParallelCorpusFilter? trainingFilter, + ParallelCorpusFilter? pretranslateFilter, + string? referenceFileLocation, + bool trainOnAll, + bool pretranslateAll + ) + { + Dictionary>? trainOnChapters = null; + if ( + trainingFilter is not null + && trainingFilter.ScriptureRange is not null + && referenceFileLocation is not null + ) + { + trainOnChapters = _parallelCorpusService + .GetChapters( + parallelCorpora.Select(Map).ToArray(), + GetFilePath(referenceFileLocation), + trainingFilter.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + } + + Dictionary>? pretranslateChapters = null; + if ( + pretranslateFilter is not null + && pretranslateFilter.ScriptureRange is not null + && referenceFileLocation is not null + ) + { + pretranslateChapters = _parallelCorpusService + .GetChapters( + parallelCorpora.Select(Map).ToArray(), + GetFilePath(referenceFileLocation), + pretranslateFilter.ScriptureRange + ) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet()); + } + + var returnCorpus = new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = inputCorpus.Id, + Language = inputCorpus.Language, + Files = inputCorpus.Files.Select(Map).ToArray(), + TrainOnAll = trainOnAll, + PretranslateAll = pretranslateAll, + }; + + if ( + trainingFilter is not null + && trainingFilter.TextIds is not null + && trainingFilter.ScriptureRange is not null + ) + { + throw new InvalidOperationException( + "Cannot specify both TextIds and ScriptureRange in the training filter." + ); + } + + returnCorpus.TrainOnChapters = trainOnChapters; + returnCorpus.TrainOnTextIds = trainingFilter?.TextIds?.ToHashSet(); + + if ( + pretranslateFilter is not null + && pretranslateFilter.TextIds is not null + && pretranslateFilter.ScriptureRange is not null + ) + { + throw new InvalidOperationException( + "Cannot specify both TextIds and ScriptureRange in the pretranslation filter." + ); + } + + returnCorpus.InferenceChapters = pretranslateChapters; + returnCorpus.InferenceTextIds = pretranslateFilter?.TextIds?.ToHashSet(); + + return returnCorpus; + } + + public SIL.ServiceToolkit.Models.ParallelCorpus Map(Corpus source, Engine engine) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceFiles.Select(f => Map(f, engine.SourceLanguage)).ToArray(), + TargetCorpora = source.TargetFiles.Select(f => Map(f, engine.TargetLanguage)).ToArray(), + }; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map(CorpusFile source, string language) + { + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = language, + Files = [Map(source)], + }; + } + + private SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source) + { + return new SIL.ServiceToolkit.Models.CorpusFile + { + Location = GetFilePath(source.Filename), + Format = (SIL.ServiceToolkit.Models.FileFormat)source.Format, + TextId = source.TextId, + }; + } + + private SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) + { + return new SIL.ServiceToolkit.Models.ParallelCorpus + { + Id = source.Id, + SourceCorpora = source.SourceCorpora.Select(Map).ToArray(), + TargetCorpora = source.TargetCorpora.Select(Map).ToArray(), + }; + } + + private SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source) + { + return new SIL.ServiceToolkit.Models.MonolingualCorpus + { + Id = source.Id, + Language = source.Language, + Files = source.Files.Select(Map).ToList(), + }; + } + + private string GetFilePath(string filename) + { + return Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, filename); + } +} diff --git a/src/Serval/src/Serval.Translation/Services/EngineService.cs b/src/Serval/src/Serval.Translation/Services/EngineService.cs index 8c208c742..e32d3bb5a 100644 --- a/src/Serval/src/Serval.Translation/Services/EngineService.cs +++ b/src/Serval/src/Serval.Translation/Services/EngineService.cs @@ -9,24 +9,22 @@ public class EngineService( IRepository pretranslations, IScopedMediator mediator, GrpcClientFactory grpcClientFactory, - IOptionsMonitor dataFileOptions, IDataAccessContext dataAccessContext, ILoggerFactory loggerFactory, - IScriptureDataFileService scriptureDataFileService, IOutboxService outboxService, - IOptionsMonitor translationOptions + IOptionsMonitor translationOptions, + ICorpusMappingService corpusMappingService ) : OwnedEntityServiceBase(engines), IEngineService { private readonly IRepository _builds = builds; private readonly IRepository _pretranslations = pretranslations; private readonly IScopedMediator _mediator = mediator; private readonly GrpcClientFactory _grpcClientFactory = grpcClientFactory; - private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; private readonly IDataAccessContext _dataAccessContext = dataAccessContext; private readonly ILogger _logger = loggerFactory.CreateLogger(); - private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService; private readonly IOutboxService _outboxService = outboxService; private readonly IOptionsMonitor _translationOptions = translationOptions; + private readonly ICorpusMappingService _corpusMappingService = corpusMappingService; public async Task TranslateAsync( string engineId, @@ -264,21 +262,6 @@ await _outboxService.EnqueueMessageAsync( ); } - private Dictionary> GetChapters(string fileLocation, string scriptureRange) - { - try - { - return ScriptureRangeParser.GetChapters( - scriptureRange, - _scriptureDataFileService.GetParatextProjectSettings(fileLocation).Versification - ); - } - catch (ArgumentException ae) - { - throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); - } - } - public async Task StartBuildAsync(Build build, CancellationToken cancellationToken = default) { return await _dataAccessContext.WithTransactionAsync( @@ -300,77 +283,13 @@ await _builds.ExistsAsync( await _builds.InsertAsync(build, ct); Engine engine = await GetAsync(build.EngineRef, ct); - StartBuildRequest request; - if (engine.ParallelCorpora.Any()) + StartBuildRequest request = new StartBuildRequest { - Dictionary? trainOn = build.TrainOn?.ToDictionary(c => - c.ParallelCorpusRef! - ); - Dictionary? pretranslate = build.Pretranslate?.ToDictionary(c => - c.ParallelCorpusRef! - ); - IReadOnlyList parallelCorpora = engine - .ParallelCorpora.Where(pc => - trainOn == null - || trainOn.ContainsKey(pc.Id) - || pretranslate == null - || pretranslate.ContainsKey(pc.Id) - ) - .ToList(); - - request = new StartBuildRequest - { - EngineType = engine.Type, - EngineId = engine.Id, - BuildId = build.Id, - Corpora = - { - parallelCorpora.Select(c => - Map( - c, - trainOn?.GetValueOrDefault(c.Id), - pretranslate?.GetValueOrDefault(c.Id), - trainOn is null, - pretranslate is null - ) - ), - }, - }; - } - else - { - Dictionary? trainOn = build.TrainOn?.ToDictionary(c => c.CorpusRef!); - Dictionary? pretranslate = build.Pretranslate?.ToDictionary(c => - c.CorpusRef! - ); - IReadOnlyList corpora = engine - .Corpora.Where(c => - trainOn == null - || trainOn.ContainsKey(c.Id) - || pretranslate == null - || pretranslate.ContainsKey(c.Id) - ) - .ToList(); - - request = new StartBuildRequest - { - EngineType = engine.Type, - EngineId = engine.Id, - BuildId = build.Id, - Corpora = - { - corpora.Select(c => - Map( - c, - trainOn?.GetValueOrDefault(c.Id), - pretranslate?.GetValueOrDefault(c.Id), - trainOn is null, - pretranslate is null - ) - ), - }, - }; - } + EngineType = engine.Type, + EngineId = engine.Id, + BuildId = build.Id, + Corpora = { _corpusMappingService.Map(build, engine).Select(Map) }, + }; if (build.Options is not null) request.Options = JsonSerializer.Serialize(build.Options); @@ -919,296 +838,89 @@ private Models.WordGraphArc Map(V1.WordGraphArc source) }; } - private V1.ParallelCorpus Map( - Corpus source, - TrainingCorpus? trainingCorpus, - PretranslateCorpus? pretranslateCorpus, - bool trainOnAllCorpora, - bool pretranslateOnAllCorpora - ) + private static V1.ParallelCorpus Map(SIL.ServiceToolkit.Models.ParallelCorpus source) { - IEnumerable sourceFiles = source.SourceFiles.Select(Map); - IEnumerable targetFiles = source.TargetFiles.Select(Map); - V1.MonolingualCorpus sourceCorpus = new() - { - Language = source.SourceLanguage, - Files = { source.SourceFiles.Select(Map) }, - }; - V1.MonolingualCorpus targetCorpus = new() + return new V1.ParallelCorpus { - Language = source.TargetLanguage, - Files = { source.TargetFiles.Select(Map) }, + Id = source.Id, + SourceCorpora = { source.SourceCorpora.Select(Map) }, + TargetCorpora = { source.TargetCorpora.Select(Map) }, }; - - if ( - trainOnAllCorpora - || (trainingCorpus is not null && trainingCorpus.TextIds is null && trainingCorpus.ScriptureRange is null) - ) - { - sourceCorpus.TrainOnAll = true; - targetCorpus.TrainOnAll = true; - } - else if (trainingCorpus is not null) - { - if (trainingCorpus.TextIds is not null && trainingCorpus.ScriptureRange is not null) - { - throw new InvalidOperationException( - $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for trainOn" - ); - } - if (trainingCorpus.TextIds is not null) - { - sourceCorpus.TrainOnTextIds.Add(trainingCorpus.TextIds); - targetCorpus.TrainOnTextIds.Add(trainingCorpus.TextIds); - } - if (!string.IsNullOrEmpty(trainingCorpus.ScriptureRange)) - { - if (targetCorpus.Files.Count > 1 || targetCorpus.Files[0].Format != V1.FileFormat.Paratext) - { - throw new InvalidOperationException( - $"The corpus {source.Id} is not compatible with using a scripture range" - ); - } - var chapters = GetChapters(targetCorpus.Files[0].Location, trainingCorpus.ScriptureRange) - .Select( - (kvp) => - { - var scriptureChapters = new ScriptureChapters(); - scriptureChapters.Chapters.Add(kvp.Value); - return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary(); - sourceCorpus.TrainOnChapters.Add(chapters); - targetCorpus.TrainOnChapters.Add(chapters); - } - } - if ( - pretranslateOnAllCorpora - || ( - pretranslateCorpus is not null - && pretranslateCorpus.TextIds is null - && pretranslateCorpus.ScriptureRange is null - ) - ) - { - sourceCorpus.PretranslateAll = true; - targetCorpus.PretranslateAll = true; - } - else if (pretranslateCorpus is not null) - { - if (pretranslateCorpus.TextIds is not null && pretranslateCorpus.ScriptureRange is not null) - { - throw new InvalidOperationException( - $"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for 'pretranslate'." - ); - } - if (pretranslateCorpus.TextIds is not null) - sourceCorpus.PretranslateTextIds.Add(pretranslateCorpus.TextIds); - if (!string.IsNullOrEmpty(pretranslateCorpus.ScriptureRange)) - { - if (targetCorpus.Files.Count > 1 || targetCorpus.Files[0].Format != V1.FileFormat.Paratext) - { - throw new InvalidOperationException( - $"The corpus {source.Id} is not compatible with using a scripture range" - ); - } - sourceCorpus.PretranslateChapters.Add( - GetChapters(targetCorpus.Files[0].Location, pretranslateCorpus.ScriptureRange) - .Select( - (kvp) => - { - var scriptureChapters = new ScriptureChapters(); - scriptureChapters.Chapters.Add(kvp.Value); - return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary() - ); - } - } - V1.ParallelCorpus corpus = new() { Id = source.Id }; - if (sourceCorpus.Files.Count > 0) - corpus.SourceCorpora.Add(sourceCorpus); - if (targetCorpus.Files.Count > 0) - corpus.TargetCorpora.Add(targetCorpus); - return corpus; } - private V1.ParallelCorpus Map( - Shared.Models.ParallelCorpus source, - TrainingCorpus? trainingCorpus, - PretranslateCorpus? pretranslateCorpus, - bool trainOnAllCorpora, - bool pretranslateOnAllCorpora - ) + private static V1.MonolingualCorpus Map(SIL.ServiceToolkit.Models.MonolingualCorpus source) { - string? referenceFileLocation = - source.TargetCorpora.Count > 0 && source.TargetCorpora[0].Files.Count > 0 - ? Map(source.TargetCorpora[0].Files[0]).Location - : null; - - bool trainOnAllSources = - trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.SourceFilters is null); - bool pretranslateAllSources = - pretranslateOnAllCorpora || (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null); - - bool trainOnAllTargets = - trainOnAllCorpora || (trainingCorpus is not null && trainingCorpus.TargetFilters is null); - bool pretranslateAllTargets = pretranslateOnAllCorpora || pretranslateCorpus is not null; // there is no pretranslate Target filter. - - return new V1.ParallelCorpus + var corpus = new V1.MonolingualCorpus { Id = source.Id, - SourceCorpora = - { - source.SourceCorpora.Select(sc => - Map( - sc, - trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), - pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(), - referenceFileLocation, - trainOnAllSources, - pretranslateAllSources - ) - ), - }, - TargetCorpora = - { - source.TargetCorpora.Select(tc => - Map( - tc, - trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(), - null, - referenceFileLocation, - trainOnAllTargets, - pretranslateAllTargets - ) - ), - }, + Language = source.Language, + Files = { source.Files.Select(Map) }, }; - } - private V1.MonolingualCorpus Map( - Shared.Models.MonolingualCorpus inputCorpus, - ParallelCorpusFilter? trainingFilter, - ParallelCorpusFilter? pretranslateFilter, - string? referenceFileLocation, - bool trainOnAll, - bool pretranslateOnAll - ) - { - Dictionary? trainOnChapters = null; - if ( - trainingFilter is not null - && trainingFilter.ScriptureRange is not null - && referenceFileLocation is not null - ) + if (source.TrainOnAll) { - trainOnChapters = GetChapters(referenceFileLocation, trainingFilter.ScriptureRange) - .Select( - (kvp) => - { - var scriptureChapters = new ScriptureChapters(); - scriptureChapters.Chapters.Add(kvp.Value); - return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary(); + corpus.TrainOnAll = true; } - - Dictionary? pretranslateChapters = null; - if ( - pretranslateFilter is not null - && pretranslateFilter.ScriptureRange is not null - && referenceFileLocation is not null - ) + else if (source.TrainOnTextIds is not null) + { + corpus.TrainOnTextIds.Add(source.TrainOnTextIds); + } + else if (source.TrainOnChapters is not null) { - pretranslateChapters = GetChapters(referenceFileLocation, pretranslateFilter.ScriptureRange) - .Select( - (kvp) => + corpus.TrainOnChapters.Add( + source + .TrainOnChapters?.Select(kvp => { var scriptureChapters = new ScriptureChapters(); scriptureChapters.Chapters.Add(kvp.Value); return (kvp.Key, scriptureChapters); - } - ) - .ToDictionary(); - } - - var returnCorpus = new V1.MonolingualCorpus - { - Id = inputCorpus.Id, - Language = inputCorpus.Language, - Files = { inputCorpus.Files.Select(Map) }, - }; - - if ( - trainingFilter is not null - && trainingFilter.TextIds is not null - && trainingFilter.ScriptureRange is not null - ) - { - throw new InvalidOperationException( - "Cannot specify both TextIds and ScriptureRange in the training filter." + }) + .ToDictionary() ); } - if ( - trainOnAll - || (trainingFilter is not null && trainingFilter.TextIds is null && trainingFilter.ScriptureRange is null) - ) + if (source.PretranslateAll) { - returnCorpus.TrainOnAll = true; + corpus.PretranslateAll = true; } - else + else if (source.InferenceTextIds is not null) { - if (trainOnChapters is not null) - returnCorpus.TrainOnChapters.Add(trainOnChapters); - if (trainingFilter?.TextIds is not null) - returnCorpus.TrainOnTextIds.Add(trainingFilter.TextIds); + corpus.PretranslateTextIds.Add(source.InferenceTextIds); } - - if ( - pretranslateFilter is not null - && pretranslateFilter.TextIds is not null - && pretranslateFilter.ScriptureRange is not null - ) + else if (source.InferenceChapters is not null) { - throw new InvalidOperationException( - "Cannot specify both TextIds and ScriptureRange in the pretranslation filter." + corpus.PretranslateChapters.Add( + source + .InferenceChapters?.Select(kvp => + { + var scriptureChapters = new ScriptureChapters(); + scriptureChapters.Chapters.Add(kvp.Value); + return (kvp.Key, scriptureChapters); + }) + .ToDictionary() ); } - if ( - pretranslateOnAll - || ( - pretranslateFilter is not null - && pretranslateFilter.TextIds is null - && pretranslateFilter.ScriptureRange is null - ) - ) - { - returnCorpus.PretranslateAll = true; - } - else - { - if (pretranslateChapters is not null) - returnCorpus.PretranslateChapters.Add(pretranslateChapters); - if (pretranslateFilter?.TextIds is not null) - returnCorpus.PretranslateTextIds.Add(pretranslateFilter.TextIds); - } - - return returnCorpus; + return corpus; } - private V1.CorpusFile Map(Shared.Models.CorpusFile source) + private static V1.CorpusFile Map(SIL.ServiceToolkit.Models.CorpusFile source) { return new V1.CorpusFile { + Location = source.Location, TextId = source.TextId, - Format = (V1.FileFormat)source.Format, - Location = Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, source.Filename), + Format = Map(source.Format), + }; + } + + private static V1.FileFormat Map(SIL.ServiceToolkit.Models.FileFormat source) + { + return source switch + { + SIL.ServiceToolkit.Models.FileFormat.Text => V1.FileFormat.Text, + SIL.ServiceToolkit.Models.FileFormat.Paratext => V1.FileFormat.Paratext, + _ => throw new InvalidEnumArgumentException(nameof(source)), }; } } diff --git a/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs b/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs new file mode 100644 index 000000000..038b7c420 --- /dev/null +++ b/src/Serval/src/Serval.Translation/Services/ICorpusMappingService.cs @@ -0,0 +1,6 @@ +namespace Serval.Translation.Services; + +public interface ICorpusMappingService +{ + IReadOnlyList Map(Build build, Engine engine); +} diff --git a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs index 588c3626a..c31ac96f9 100644 --- a/src/Serval/src/Serval.Translation/Services/PretranslationService.cs +++ b/src/Serval/src/Serval.Translation/Services/PretranslationService.cs @@ -1,6 +1,4 @@ using SIL.Machine.Corpora; -using SIL.Machine.PunctuationAnalysis; -using SIL.Machine.Translation; namespace Serval.Translation.Services; @@ -8,12 +6,14 @@ public class PretranslationService( IRepository pretranslations, IRepository engines, IRepository builds, - IScriptureDataFileService scriptureDataFileService + ICorpusMappingService corpusMappingService, + IParallelCorpusService parallelCorpusService ) : EntityServiceBase(pretranslations), IPretranslationService { private readonly IRepository _engines = engines; private readonly IRepository _builds = builds; - private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService; + private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService; + private readonly ICorpusMappingService _corpusMappingService = corpusMappingService; private const string AIDisclaimerRemark = "This draft of {0} was generated using AI on {1}. It should be reviewed and edited carefully."; @@ -50,38 +50,16 @@ public async Task GetUsfmAsync( ) { Engine? engine = await _engines.GetAsync(engineId, cancellationToken); - Corpus? corpus = engine?.Corpora.SingleOrDefault(c => c.Id == corpusId); - ParallelCorpus? parallelCorpus = engine?.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); - Build? build = (await _builds.GetAllAsync(b => b.EngineRef == engineId, cancellationToken)) - .OrderByDescending(b => b.DateFinished) - .FirstOrDefault(); - if (build is null || build.DateFinished is null) - throw new InvalidOperationException($"Could not find any completed builds for engine '{engineId}'."); - - string disclaimerRemark = string.Format( - CultureInfo.InvariantCulture, - AIDisclaimerRemark, - textId, - build.DateFinished.Value.ToUniversalTime().ToString("u") - ); - string markerPlacementRemark = GenerateMarkerPlacementRemark( - paragraphMarkerBehavior, - embedBehavior, - styleMarkerBehavior - ); - - List remarks = [disclaimerRemark, markerPlacementRemark]; - - CorpusFile sourceFile; - CorpusFile targetFile; + if (engine is null) + throw new EntityNotFoundException($"Could not find the Engine '{engineId}'."); + Corpus? corpus = engine.Corpora.SingleOrDefault(c => c.Id == corpusId); + ParallelCorpus? parallelCorpus = engine.ParallelCorpora.SingleOrDefault(c => c.Id == corpusId); if (corpus is not null) { if (corpus.SourceFiles.Count == 0) throw new InvalidOperationException($"The corpus {corpus.Id} has no source files."); - sourceFile = corpus.SourceFiles[0]; if (corpus.TargetFiles.Count == 0) throw new InvalidOperationException($"The corpus {corpus.Id} has no target files."); - targetFile = corpus.TargetFiles[0]; } else if (parallelCorpus is not null) { @@ -95,7 +73,6 @@ public async Task GetUsfmAsync( $"The corpus {parallelCorpus.SourceCorpora[0].Id} referenced in parallel corpus {parallelCorpus.Id} has no files associated with it." ); } - sourceFile = parallelCorpus.SourceCorpora[0].Files[0]; if (parallelCorpus.TargetCorpora.Count == 0) { throw new InvalidOperationException($"The parallel corpus {parallelCorpus.Id} has no target corpora."); @@ -106,135 +83,77 @@ public async Task GetUsfmAsync( $"The corpus {parallelCorpus.TargetCorpora[0].Id} referenced in parallel corpus {parallelCorpus.Id} has no files associated with it." ); } - targetFile = parallelCorpus.TargetCorpora[0].Files[0]; } else { throw new EntityNotFoundException($"Could not find the corpus '{corpusId}' in engine '{engineId}'."); } - if (sourceFile.Format is not FileFormat.Paratext || targetFile.Format is not FileFormat.Paratext) - throw new InvalidOperationException("USFM format is not valid for non-Scripture corpora."); - ParatextProjectSettings sourceSettings = _scriptureDataFileService.GetParatextProjectSettings( - sourceFile.Filename - ); - ParatextProjectSettings targetSettings = _scriptureDataFileService.GetParatextProjectSettings( - targetFile.Filename - ); + Build? build = (await _builds.GetAllAsync(b => b.EngineRef == engineId, cancellationToken)) + .OrderByDescending(b => b.DateFinished) + .FirstOrDefault(); + if (build is null || build.DateFinished is null) + throw new InvalidOperationException($"Could not find any completed builds for engine '{engineId}'."); - IEnumerable pretranslations = await GetAllAsync( - engineId, - modelRevision, - corpusId, + string disclaimerRemark = string.Format( + CultureInfo.InvariantCulture, + AIDisclaimerRemark, textId, - cancellationToken + build.DateFinished.Value.ToUniversalTime().ToString("u") + ); + string markerPlacementRemark = GenerateMarkerPlacementRemark( + paragraphMarkerBehavior, + embedBehavior, + styleMarkerBehavior ); - IEnumerable<( - IReadOnlyList SourceScriptureRefs, - IReadOnlyList TargetScriptureRefs, - Pretranslation Pretranslation, - PretranslationUsfmMarkerBehavior ParagraphBehavior, - PretranslationUsfmMarkerBehavior StyleBehavior - )> pretranslationRows = pretranslations - .Select(p => Map(p, sourceSettings, targetSettings, paragraphMarkerBehavior, styleMarkerBehavior)) - .Where(p => p.TargetScriptureRefs.Any()) - .OrderBy(p => p.TargetScriptureRefs[0]); + List remarks = [disclaimerRemark, markerPlacementRemark]; - List updateBlockHandlers = []; - if ( - paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition - && template == PretranslationUsfmTemplate.Source - ) + SIL.ServiceToolkit.Models.ParallelCorpus[] parallelCorpora = _corpusMappingService.Map(build, engine).ToArray(); + + IEnumerable pretranslations = ( + await GetAllAsync(engineId, modelRevision, corpusId, textId, cancellationToken) + ).Select(p => new SIL.ServiceToolkit.Models.ParallelRow { - updateBlockHandlers.Add(new PlaceMarkersUsfmUpdateBlockHandler()); - } + SourceRefs = p.SourceRefs ?? [], + TargetRefs = p.TargetRefs ?? [], + TargetText = p.Translation, + Alignment = p + .Alignment?.Select(wp => new SIL.Machine.Corpora.AlignedWordPair(wp.SourceIndex, wp.TargetIndex)) + .ToArray(), + SourceTokens = p.SourceTokens, + TargetTokens = p.TranslationTokens, + }); + + string? targetQuoteConvention = null; + if (quoteNormalizationBehavior == PretranslationNormalizationBehavior.Denormalized) + targetQuoteConvention = build.TargetQuoteConvention; string usfm = ""; // Update the target book if it exists if (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Target) { - // the pretranslations are generated from the source book and inserted into the target book - // use relaxed references since the USFM structure may not be the same - pretranslationRows = pretranslationRows.Select(p => - ( - // we won't use the source refs - (IReadOnlyList)[], - (IReadOnlyList)p.TargetScriptureRefs.Select(r => r.ToRelaxed()).ToArray(), - p.Pretranslation, - p.ParagraphBehavior, - p.StyleBehavior - ) - ); - using Shared.Services.ZipParatextProjectTextUpdater updater = - _scriptureDataFileService.GetZipParatextProjectTextUpdater(targetFile.Filename); - switch (textOrigin) + UpdateUsfmTextBehavior textBehavior = textOrigin switch { - case PretranslationUsfmTextOrigin.PreferExisting: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: false)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.PreferExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.PreferPretranslated: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: false)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.PreferNew, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.OnlyExisting: - usfm = - updater.UpdateUsfm( - textId, - [], // don't put any pretranslations, we only want the existing text. - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.PreferNew, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.OnlyPretranslated: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: false)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.StripExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: false - ) ?? ""; - break; - } + PretranslationUsfmTextOrigin.PreferExisting => UpdateUsfmTextBehavior.PreferExisting, + PretranslationUsfmTextOrigin.PreferPretranslated => UpdateUsfmTextBehavior.PreferNew, + PretranslationUsfmTextOrigin.OnlyExisting => UpdateUsfmTextBehavior.PreferNew, + PretranslationUsfmTextOrigin.OnlyPretranslated => UpdateUsfmTextBehavior.StripExisting, + _ => throw new InvalidEnumArgumentException(nameof(textOrigin)), + }; + + usfm = _parallelCorpusService.UpdateTargetUsfm( + parallelCorpora, + corpusId, + textId, + textOrigin == PretranslationUsfmTextOrigin.OnlyExisting ? [] : pretranslations.ToArray(), + textBehavior, + Map(paragraphMarkerBehavior), + Map(embedBehavior), + Map(styleMarkerBehavior), + remarks, + targetQuoteConvention + ); } if ( @@ -242,206 +161,24 @@ PretranslationUsfmMarkerBehavior StyleBehavior && (template is PretranslationUsfmTemplate.Auto or PretranslationUsfmTemplate.Source) ) { - using Shared.Services.ZipParatextProjectTextUpdater updater = - _scriptureDataFileService.GetZipParatextProjectTextUpdater(sourceFile.Filename); - // Copy and update the source book if it exists - switch (textOrigin) - { - case PretranslationUsfmTextOrigin.PreferExisting: - case PretranslationUsfmTextOrigin.PreferPretranslated: - case PretranslationUsfmTextOrigin.OnlyPretranslated: - usfm = - updater.UpdateUsfm( - textId, - pretranslationRows.Select(pr => Map(pr, isSource: true)).ToList(), - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.StripExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: true - ) ?? ""; - break; - case PretranslationUsfmTextOrigin.OnlyExisting: - usfm = - updater.UpdateUsfm( - textId, - [], // don't pass the pretranslations, we only want the existing text. - fullName: targetSettings.FullName, - textBehavior: UpdateUsfmTextBehavior.StripExisting, - paragraphBehavior: Map(paragraphMarkerBehavior), - embedBehavior: Map(embedBehavior), - styleBehavior: Map(styleMarkerBehavior), - updateBlockHandlers: updateBlockHandlers, - remarks: remarks, - errorHandler: (_) => true, - compareSegments: true - ) ?? ""; - break; - } - } - if ( - quoteNormalizationBehavior == PretranslationNormalizationBehavior.Denormalized - && !string.IsNullOrEmpty(build.TargetQuoteConvention) - ) - { - usfm = DenormalizeQuotationMarks(usfm, build.TargetQuoteConvention); - } - - return usfm; - } - - private static ( - IReadOnlyList SourceScriptureRefs, - IReadOnlyList TargetScriptureRefs, - Pretranslation Pretranslation, - PretranslationUsfmMarkerBehavior ParagraphMarkerBehavior, - PretranslationUsfmMarkerBehavior StyleMarkerBehavior - ) Map( - Pretranslation pretranslation, - ParatextProjectSettings sourceSettings, - ParatextProjectSettings targetSettings, - PretranslationUsfmMarkerBehavior paragraphMarkerBehavior, - PretranslationUsfmMarkerBehavior styleMarkerBehavior - ) - { - IReadOnlyList sourceScriptureRefs, - targetScriptureRefs; - if (pretranslation.TargetRefs?.Any() ?? false) - { - sourceScriptureRefs = - pretranslation - .SourceRefs?.Select(r => - { - bool parsed = ScriptureRef.TryParse(r, sourceSettings.Versification, out ScriptureRef sr); - return new { Parsed = parsed, ScriptureRef = sr }; - }) - .Where(r => r.Parsed) - .Select(r => r.ScriptureRef) - .ToArray() - ?? []; - targetScriptureRefs = pretranslation - .TargetRefs.Select(r => - { - bool parsed = ScriptureRef.TryParse(r, targetSettings.Versification, out ScriptureRef sr); - return new { Parsed = parsed, ScriptureRef = sr }; - }) - .Where(r => r.Parsed) - .Select(r => r.ScriptureRef) - .ToArray(); - } - else - { - sourceScriptureRefs = []; - targetScriptureRefs = pretranslation - .Refs.Select(r => - { - bool parsed = ScriptureRef.TryParse(r, targetSettings.Versification, out ScriptureRef sr); - return new { Parsed = parsed, ScriptureRef = sr }; - }) - .Where(r => r.Parsed) - .Select(r => r.ScriptureRef) - .ToArray(); - } - - return (sourceScriptureRefs, targetScriptureRefs, pretranslation, paragraphMarkerBehavior, styleMarkerBehavior); - } - - private static string DenormalizeQuotationMarks(string usfm, string quoteConvention) - { - QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(quoteConvention); - if (targetQuoteConvention is null) - return usfm; - - QuotationMarkDenormalizationFirstPass quotationMarkDenormalizationFirstPass = new(targetQuoteConvention); - - UsfmParser.Parse(usfm, quotationMarkDenormalizationFirstPass); - List<(int ChapterNumber, QuotationMarkUpdateStrategy Strategy)> bestChapterStrategies = - quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); - - QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationMarkDenormalizer = new( - targetQuoteConvention, - new QuotationMarkUpdateSettings( - chapterStrategies: bestChapterStrategies.Select(tuple => tuple.Strategy).ToList() - ) - ); - int denormalizableChapterCount = bestChapterStrategies.Count(tup => - tup.Strategy != QuotationMarkUpdateStrategy.Skip - ); - List remarks = []; - string quotationDenormalizationRemark; - if (denormalizableChapterCount == bestChapterStrategies.Count) - { - quotationDenormalizationRemark = - "The quote style in all chapters has been automatically adjusted to match the rest of the project."; - } - else if (denormalizableChapterCount > 0) - { - quotationDenormalizationRemark = - "The quote style in the following chapters has been automatically adjusted to match the rest of the project: " - + GetChapterRangesString( - bestChapterStrategies - .Where(tuple => tuple.Strategy != QuotationMarkUpdateStrategy.Skip) - .Select(tuple => tuple.ChapterNumber) - .ToList() - ) - + "."; - } - else - { - quotationDenormalizationRemark = - "The quote style was not automatically adjusted to match the rest of your project in any chapters."; + usfm = _parallelCorpusService.UpdateSourceUsfm( + parallelCorpora, + corpusId, + textId, + textOrigin == PretranslationUsfmTextOrigin.OnlyExisting ? [] : pretranslations.ToArray(), + Map(paragraphMarkerBehavior), + Map(embedBehavior), + Map(styleMarkerBehavior), + placeParagraphMarkers: paragraphMarkerBehavior == PretranslationUsfmMarkerBehavior.PreservePosition, + remarks, + targetQuoteConvention + ); } - remarks.Add(quotationDenormalizationRemark); - - var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer], remarks: remarks); - UsfmParser.Parse(usfm, updater); - usfm = updater.GetUsfm(); return usfm; } - public static string GetChapterRangesString(List chapterNumbers) - { - chapterNumbers = chapterNumbers.Order().ToList(); - int start = chapterNumbers[0]; - int end = chapterNumbers[0]; - List chapterRangeStrings = []; - foreach (int chapterNumber in chapterNumbers[1..]) - { - if (chapterNumber == end + 1) - { - end = chapterNumber; - } - else - { - if (start == end) - { - chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); - } - else - { - chapterRangeStrings.Add($"{start}-{end}"); - } - start = chapterNumber; - end = chapterNumber; - } - } - if (start == end) - { - chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); - } - else - { - chapterRangeStrings.Add($"{start}-{end}"); - } - return string.Join(", ", chapterRangeStrings); - } - /// /// Generate a natural sounding remark/comment describing marker placement. /// @@ -510,59 +247,4 @@ private static UpdateUsfmMarkerBehavior Map(PretranslationUsfmMarkerBehavior beh _ => throw new InvalidEnumArgumentException(nameof(behavior)), }; } - - private static WordAlignmentMatrix Map(IEnumerable? alignedWordPairs) - { - int rowCount = 0; - int columnCount = 0; - if (alignedWordPairs is not null) - { - foreach (Models.AlignedWordPair pair in alignedWordPairs) - { - if (pair.SourceIndex + 1 > rowCount) - rowCount = pair.SourceIndex + 1; - if (pair.TargetIndex + 1 > columnCount) - columnCount = pair.TargetIndex + 1; - } - } - return new WordAlignmentMatrix( - rowCount, - columnCount, - alignedWordPairs?.Select(wp => (wp.SourceIndex, wp.TargetIndex)) - ); - } - - private static UpdateUsfmRow Map( - ( - IReadOnlyList SourceScriptureRefs, - IReadOnlyList TargetScriptureRefs, - Pretranslation Pretranslation, - PretranslationUsfmMarkerBehavior ParagraphBehavior, - PretranslationUsfmMarkerBehavior StyleBehavior - ) pretranslationRow, - bool isSource - ) - { - return new UpdateUsfmRow( - isSource && pretranslationRow.SourceScriptureRefs.Any() - ? pretranslationRow.SourceScriptureRefs - : pretranslationRow.TargetScriptureRefs, - pretranslationRow.Pretranslation.Translation, - pretranslationRow.Pretranslation.Alignment is not null - ? new Dictionary - { - { - PlaceMarkersAlignmentInfo.MetadataKey, - new PlaceMarkersAlignmentInfo( - pretranslationRow.Pretranslation.SourceTokens?.ToList() ?? [], - pretranslationRow.Pretranslation.TranslationTokens?.ToList() ?? [], - Map(pretranslationRow.Pretranslation.Alignment), - paragraphBehavior: Map(pretranslationRow.ParagraphBehavior), - styleBehavior: Map(pretranslationRow.StyleBehavior) - ) - }, - } - : null - ); - } } diff --git a/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs b/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs index 6deac95f6..5b6addff7 100644 --- a/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs +++ b/src/Serval/src/Serval.WordAlignment/Services/EngineService.cs @@ -10,7 +10,6 @@ public class EngineService( IOptionsMonitor dataFileOptions, IDataAccessContext dataAccessContext, ILoggerFactory loggerFactory, - IScriptureDataFileService scriptureDataFileService, IOutboxService outboxService, IOptionsMonitor wordAlignmentOptions ) : OwnedEntityServiceBase(engines), IEngineService @@ -21,7 +20,6 @@ IOptionsMonitor wordAlignmentOptions private readonly IOptionsMonitor _dataFileOptions = dataFileOptions; private readonly IDataAccessContext _dataAccessContext = dataAccessContext; private readonly ILogger _logger = loggerFactory.CreateLogger(); - private readonly IScriptureDataFileService _scriptureDataFileService = scriptureDataFileService; private readonly IOutboxService _outboxService = outboxService; private readonly IOptionsMonitor _wordAlignmentOptions = wordAlignmentOptions; @@ -126,13 +124,16 @@ await _outboxService.EnqueueMessageAsync( ); } - private Dictionary> GetChapters(string fileLocation, string scriptureRange) + protected virtual Dictionary> GetChapters(string fileLocation, string scriptureRange) { try { + using var archive = new ZipContainer( + Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, fileLocation) + ); return ScriptureRangeParser.GetChapters( scriptureRange, - _scriptureDataFileService.GetParatextProjectSettings(fileLocation).Versification + new ZipParatextProjectSettingsParser(archive).Parse().Versification ); } catch (ArgumentException ae) diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs index 3b287d705..24ee437e4 100644 --- a/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/TranslationEngineTests.cs @@ -1,8 +1,8 @@ +using System.IO.Compression; using Google.Protobuf.WellKnownTypes; using Serval.Translation.Configuration; using Serval.Translation.Models; using Serval.Translation.V1; -using SIL.ServiceToolkit.Services; using static Serval.ApiServer.Utils; using Phase = Serval.Client.Phase; using PhaseStage = Serval.Client.PhaseStage; @@ -2323,7 +2323,7 @@ await _env.Builds.InsertAsync( Assert.That( usfm.Replace("\r\n", "\n"), Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \h @@ -2557,6 +2557,8 @@ public void TearDown() private class TestEnvironment : DisposableBase { private readonly IServiceScope _scope; + private readonly IOptionsMonitor _dataFileOptions; + private static readonly string TestDataPath = Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "data"); public readonly MongoClient MongoClient; public TestEnvironment() @@ -2740,6 +2742,9 @@ public TestEnvironment() .Returns(CreateAsyncUnaryCall(StatusCode.Unimplemented)); SmtClient = Substitute.For(); + _dataFileOptions = _scope.ServiceProvider.GetRequiredService>(); + ZipParatextProject(FILE3_FILENAME); + ZipParatextProject(FILE4_FILENAME); } public ServalWebApplicationFactory Factory { get; } @@ -2768,7 +2773,6 @@ public TranslationBuildsClient CreateTranslationBuildsClient(IEnumerable .CreateClient("Nmt") .Returns(NmtClient); services.AddSingleton(grpcClientFactory); - services.AddTransient(CreateFileSystem); }); }) .CreateClient(); @@ -2798,7 +2802,6 @@ public TranslationEnginesClient CreateTranslationEnginesClient(IEnumerable("Nmt") .Returns(NmtClient); services.AddSingleton(grpcClientFactory); - services.AddTransient(CreateFileSystem); }); }) .CreateClient(); @@ -2859,15 +2862,7 @@ public TranslationEngineTypesClient CreateTranslationEngineTypesClient(IEnumerab public DataFilesClient CreateDataFilesClient() { IEnumerable scope = [Scopes.DeleteFiles, Scopes.ReadFiles, Scopes.UpdateFiles, Scopes.CreateFiles]; - HttpClient httpClient = Factory - .WithWebHostBuilder(builder => - { - builder.ConfigureTestServices(services => - { - services.AddTransient(CreateFileSystem); - }); - }) - .CreateClient(); + HttpClient httpClient = Factory.CreateClient(); if (scope is not null) httpClient.DefaultRequestHeaders.Add("Scope", string.Join(" ", scope)); return new DataFilesClient(httpClient); @@ -2888,80 +2883,21 @@ public void ResetDatabases() MongoClient.DropDatabase("serval_test_jobs"); } - private static IFileSystem CreateFileSystem(IServiceProvider sp) - { - IFileSystem fileSystem = Substitute.For(); - IOptionsMonitor dataFileOptions = sp.GetRequiredService< - IOptionsMonitor - >(); - fileSystem - .OpenZipFile(GetFilePath(dataFileOptions, FILE3_FILENAME)) - .Returns(ci => - { - IZipContainer source = CreateZipContainer("SRC"); - source.EntryExists("MATSRC.SFM").Returns(true); - string usfm = - $@"\id MAT - SRC -\h Matthew -\c 1 -\p -\v 1 Chapter one, verse one. -\v 2 Chapter one, verse two. -"; - source.OpenEntry("MATSRC.SFM").Returns(ci => new MemoryStream(Encoding.UTF8.GetBytes(usfm))); - return source; - }); - fileSystem - .OpenZipFile(GetFilePath(dataFileOptions, FILE4_FILENAME)) - .Returns(ci => - { - IZipContainer target = CreateZipContainer("TRG"); - target.EntryExists("MATTRG.SFM").Returns(false); - return target; - }); - fileSystem.OpenWrite(Arg.Any()).Returns(ci => new MemoryStream()); - return fileSystem; - } - - private static IZipContainer CreateZipContainer(string name) - { - IZipContainer container = Substitute.For(); - container.EntryExists("Settings.xml").Returns(true); - XElement settingsXml = new( - "ScriptureText", - new XElement("StyleSheet", "usfm.sty"), - new XElement("Guid", "Id"), - new XElement("Name", name), - new XElement("FullName", name), - new XElement("Encoding", "65001"), - new XElement( - "Naming", - new XAttribute("PrePart", ""), - new XAttribute("PostPart", $"{name}.SFM"), - new XAttribute("BookNameForm", "MAT") - ), - new XElement("BiblicalTermsListSetting", "Major::BiblicalTerms.xml") - ); - container - .OpenEntry("Settings.xml") - .Returns(new MemoryStream(Encoding.UTF8.GetBytes(settingsXml.ToString()))); - container.EntryExists("custom.vrs").Returns(false); - container.EntryExists("usfm.sty").Returns(false); - container.EntryExists("custom.sty").Returns(false); - return container; - } - - private static string GetFilePath(IOptionsMonitor dataFileOptions, string fileName) - { - return Path.Combine(dataFileOptions.CurrentValue.FilesDirectory, fileName); - } - protected override void DisposeManagedResources() { _scope.Dispose(); Factory.Dispose(); ResetDatabases(); } + + private string ZipParatextProject(string name) + { + string fileName = Path.Combine(_dataFileOptions.CurrentValue.FilesDirectory, name); + if (File.Exists(fileName)) + File.Delete(fileName); + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, name), fileName); + return fileName; + } } } diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM new file mode 100644 index 000000000..24df58815 --- /dev/null +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/41MATTe1.SFM @@ -0,0 +1,6 @@ +\id MAT - SRC +\h Matthew +\c 1 +\p +\v 1 Chapter one, verse one. +\v 2 Chapter one, verse two. diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml new file mode 100644 index 000000000..6358f4f0b --- /dev/null +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_c/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test1 + 65001 + T + + NFC + Te1 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Tes.SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml new file mode 100644 index 000000000..4ce9e238d --- /dev/null +++ b/src/Serval/test/Serval.ApiServer.IntegrationTests/data/file_d/Settings.xml @@ -0,0 +1,33 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test2 + 65001 + T + + NFC + Te2 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs b/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs deleted file mode 100644 index 5836085e5..000000000 --- a/src/Serval/test/Serval.Shared.Tests/Services/ScriptureDataFileServiceTests.cs +++ /dev/null @@ -1,106 +0,0 @@ -namespace Serval.Shared.Services; - -[TestFixture] -public class ScriptureDataFileServiceTests -{ - [Test] - public void GetParatextProjectSettings() - { - TestEnvironment env = new(); - ParatextProjectSettings settings = env.Service.GetParatextProjectSettings("file1.zip"); - Assert.That(settings.Name, Is.EqualTo("PROJ")); - } - - [Test] - public void GetZipParatextProjectTextUpdater() - { - TestEnvironment env = new(); - using ZipParatextProjectTextUpdater updater = env.Service.GetZipParatextProjectTextUpdater("file1.zip"); - Assert.That( - updater.UpdateUsfm("MAT", [], textBehavior: UpdateUsfmTextBehavior.PreferExisting).ReplaceLineEndings("\n"), - Is.EqualTo( - $@"\id MAT - PROJ -\h {Canon.BookIdToEnglishName("MAT")} -\c 1 -\p -\v 1 Chapter one, verse one. -\v 2 Chapter one, verse two. -\c 2 -\p -\v 1 Chapter two, verse one. -\v 2 Chapter two, verse two. -" - ) - .IgnoreLineEndings() - ); - } - - private class TestEnvironment - { - public TestEnvironment() - { - IFileSystem fileSystem = Substitute.For(); - fileSystem - .OpenZipFile("file1.zip") - .Returns(ci => - { - IZipContainer container = CreateZipContainer(); - AddBook(container, "MAT"); - return container; - }); - IOptionsMonitor dataFileOptions = Substitute.For>(); - dataFileOptions.CurrentValue.Returns(new DataFileOptions()); - - Service = new ScriptureDataFileService(fileSystem, dataFileOptions); - } - - public ScriptureDataFileService Service { get; } - - private static IZipContainer CreateZipContainer() - { - IZipContainer container = Substitute.For(); - container.EntryExists("Settings.xml").Returns(true); - XElement settingsXml = new( - "ScriptureText", - new XElement("StyleSheet", "usfm.sty"), - new XElement("Guid", "ID"), - new XElement("Name", "PROJ"), - new XElement("FullName", "PROJ"), - new XElement("Encoding", "65001"), - new XElement( - "Naming", - new XAttribute("PrePart", ""), - new XAttribute("PostPart", "PROJ.SFM"), - new XAttribute("BookNameForm", "MAT") - ), - new XElement("BiblicalTermsListSetting", "Major::BiblicalTerms.xml") - ); - container - .OpenEntry("Settings.xml") - .Returns(new MemoryStream(Encoding.UTF8.GetBytes(settingsXml.ToString()))); - container.EntryExists("custom.vrs").Returns(false); - container.EntryExists("usfm.sty").Returns(false); - container.EntryExists("custom.sty").Returns(false); - return container; - } - - private static void AddBook(IZipContainer container, string book) - { - string bookFileName = $"{book}PROJ.SFM"; - container.EntryExists(bookFileName).Returns(true); - string usfm = - $@"\id {book} - PROJ -\h {Canon.BookIdToEnglishName(book)} -\c 1 -\p -\v 1 Chapter one, verse one. -\v 2 Chapter one, verse two. -\c 2 -\p -\v 1 Chapter two, verse one. -\v 2 Chapter two, verse two. -"; - container.OpenEntry(bookFileName).Returns(new MemoryStream(Encoding.UTF8.GetBytes(usfm))); - } - } -} diff --git a/src/Serval/test/Serval.Shared.Tests/Usings.cs b/src/Serval/test/Serval.Shared.Tests/Usings.cs index ec2251e6f..86da99b25 100644 --- a/src/Serval/test/Serval.Shared.Tests/Usings.cs +++ b/src/Serval/test/Serval.Shared.Tests/Usings.cs @@ -1,13 +1,4 @@ -global using System.Text; global using System.Text.Json; -global using System.Xml.Linq; -global using Microsoft.Extensions.Options; -global using NSubstitute; global using NUnit.Framework; global using NUnit.Framework.Constraints; -global using Serval.Shared.Configuration; -global using Serval.Shared.Utils; -global using SIL.Machine.Corpora; -global using SIL.Scripture; -global using SIL.ServiceToolkit.Services; global using SIL.ServiceToolkit.Utils; diff --git a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs index 9a9f63315..98dd9c46e 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs @@ -138,6 +138,7 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", Language = "es", Files = { @@ -159,6 +160,7 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", Language = "en", Files = { @@ -217,6 +219,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", TrainOnTextIds = { }, Files = @@ -239,6 +243,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", TrainOnTextIds = { }, Files = @@ -298,6 +304,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", TrainOnTextIds = { "text1" }, Files = @@ -320,6 +328,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", TrainOnTextIds = { "text1" }, Files = @@ -379,6 +389,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -400,6 +412,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -459,6 +473,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -480,6 +496,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -539,6 +557,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -560,6 +580,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -585,6 +607,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus2", + Language = "es", Files = { @@ -606,6 +630,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus2", + Language = "en", Files = { @@ -682,6 +708,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", TrainOnChapters = { @@ -714,6 +742,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", TrainOnChapters = { @@ -783,6 +813,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "es", Files = { @@ -804,6 +836,8 @@ await env.Service.StartBuildAsync( { new() { + Id = "corpus1", + Language = "en", Files = { @@ -2356,27 +2390,6 @@ public TestEnvironment() .Returns(TranslationServiceClient); IOptionsMonitor dataFileOptions = Substitute.For>(); dataFileOptions.CurrentValue.Returns(new DataFileOptions()); - var scriptureDataFileService = Substitute.For(); - scriptureDataFileService - .GetParatextProjectSettings(Arg.Any()) - .Returns( - new ParatextProjectSettings( - guid: "Id", - name: "Tst", - fullName: "Test", - encoding: Encoding.UTF8, - versification: ScrVers.English, - stylesheet: new UsfmStylesheet("usfm.sty"), - fileNamePrefix: "TST", - fileNameForm: "MAT", - fileNameSuffix: ".USFM", - biblicalTermsListType: "BiblicalTerms", - biblicalTermsProjectName: "", - biblicalTermsFileName: "BiblicalTerms.xml", - languageCode: "en", - translationType: "Standard" - ) - ); Pretranslations = new MemoryRepository(); OutboxService = Substitute.For(); @@ -2386,6 +2399,17 @@ public TestEnvironment() translationOptions.CurrentValue.Returns( new TranslationOptions { Engines = [new EngineInfo { Type = "Smt" }] } ); + var parallelCorpusService = Substitute.For(); + parallelCorpusService + .GetChapters( + Arg.Any>(), + Arg.Any(), + Arg.Any() + ) + .Returns(callInfo => + { + return ScriptureRangeParser.GetChapters(callInfo.ArgAt(2)); + }); Service = new EngineService( Engines, @@ -2393,12 +2417,11 @@ public TestEnvironment() Pretranslations, Substitute.For(), grpcClientFactory, - dataFileOptions, new MemoryDataAccessContext(), new LoggerFactory(), - scriptureDataFileService, OutboxService, - translationOptions + translationOptions, + new CorpusMappingService(dataFileOptions, parallelCorpusService) ); } diff --git a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs index b498f4da7..941714a53 100644 --- a/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs +++ b/src/Serval/test/Serval.Translation.Tests/Services/PretranslationServiceTests.cs @@ -1,25 +1,11 @@ -namespace Serval.Translation.Services; +using System.IO.Compression; +using SIL.Machine.Utils; + +namespace Serval.Translation.Services; [TestFixture] public class PretranslationServiceTests { - private const string SourceUsfm = - $@"\id MAT - SRC -\c 1 -\v 1 SRC - Chapter one, verse one. -\p new paragraph -\v 2 -\v 3 SRC - Chapter one, verse three. -"; - - private const string TargetUsfm = - @"\id MAT - TRG -\c 1 -\v 1 TRG - Chapter one, verse one. -\v 2 -\v 3 TRG - Chapter one, verse three. -"; - [Test] public async Task GetUsfmAsync_Source_PreferExisting() { @@ -33,7 +19,7 @@ public async Task GetUsfmAsync_Source_PreferExisting() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -60,7 +46,7 @@ public async Task GetUsfmAsync_Source_PreferPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -87,7 +73,7 @@ public async Task GetUsfmAsync_Source_OnlyExisting() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -114,7 +100,7 @@ public async Task GetUsfmAsync_Source_OnlyPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -142,7 +128,7 @@ public async Task GetUsfmAsync_Source_PlaceMarkers() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Embed markers were moved to the end of the verse. Paragraph breaks have positions preserved. Style markers were removed. \c 1 @@ -159,8 +145,7 @@ public async Task GetUsfmAsync_Source_PlaceMarkers() [Test] public async Task GetUsfmAsync_Target_PreferExisting() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferExisting, @@ -186,8 +171,7 @@ public async Task GetUsfmAsync_Target_PreferExisting() [Test] public async Task GetUsfmAsync_Target_PreferPretranslated() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -197,7 +181,7 @@ public async Task GetUsfmAsync_Target_PreferPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test3 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -236,7 +220,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test1 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -253,8 +237,7 @@ public async Task GetUsfmAsync_Auto_TargetBookDoesNotExist() [Test] public async Task GetUsfmAsync_Auto_TargetBookExists() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.PreferPretranslated, @@ -264,7 +247,7 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test3 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -280,15 +263,22 @@ public async Task GetUsfmAsync_Auto_TargetBookExists() [Test] public async Task GetUsfmAsync_Target_OnlyExisting() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyExisting, PretranslationUsfmTemplate.Target ); - List lines = TargetUsfm.Split('\n').ToList(); + string targetUsfm = + @"\id MAT - Test3 +\c 1 +\v 1 TRG - Chapter one, verse one. +\v 2 +\v 3 TRG - Chapter one, verse three. +"; + + List lines = targetUsfm.Split('\n').ToList(); lines.Insert( 1, @@ -304,8 +294,7 @@ public async Task GetUsfmAsync_Target_OnlyExisting() [Test] public async Task GetUsfmAsync_Target_OnlyPretranslated() { - using TestEnvironment env = new(); - env.AddMatthewToTarget(); + using TestEnvironment env = new(addMatthew: true); string usfm = await env.GetUsfmAsync( PretranslationUsfmTextOrigin.OnlyPretranslated, @@ -315,7 +304,7 @@ public async Task GetUsfmAsync_Target_OnlyPretranslated() Assert.That( usfm, Is.EqualTo( - @"\id MAT - TRG + @"\id MAT - Test3 \rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully. \rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed. \c 1 @@ -427,6 +416,26 @@ public void GetUsfmAsync_BadPretranslationVerseRef() }); } + [Test] + public void GetUsfmAsync_EngineDoesNotExist() + { + using TestEnvironment env = new(); + Assert.ThrowsAsync(async () => + await env.Service.GetUsfmAsync( + engineId: "engine2", + modelRevision: 1, + corpusId: "corpus1", + textId: "MAT", + textOrigin: PretranslationUsfmTextOrigin.PreferPretranslated, + template: PretranslationUsfmTemplate.Auto, + paragraphMarkerBehavior: PretranslationUsfmMarkerBehavior.PreservePosition, + embedBehavior: PretranslationUsfmMarkerBehavior.Preserve, + styleMarkerBehavior: PretranslationUsfmMarkerBehavior.Strip, + quoteNormalizationBehavior: PretranslationNormalizationBehavior.Denormalized + ) + ); + } + [Test] [TestCase(new int[] { 1, 2, 3 }, "1-3")] [TestCase(new int[] { 1, 3, 4 }, "1, 3-4")] @@ -435,14 +444,37 @@ public void GetUsfmAsync_BadPretranslationVerseRef() [TestCase(new int[] { 1 }, "1")] public void GetChapterRanges(int[] chapterNumbers, string expectedRangeString) { - string actualRangeString = PretranslationService.GetChapterRangesString(chapterNumbers.ToList()); + string actualRangeString = ParallelCorpusService.GetChapterRangesString(chapterNumbers.ToList()); Assert.That(actualRangeString, Is.EqualTo(expectedRangeString)); } private class TestEnvironment : IDisposable { - public TestEnvironment() + private static readonly string TestDataPath = Path.Combine("..", "..", "..", "data"); + + public TestEnvironment(bool addMatthew = false) { + _tempDir = new TempDirectory("PretranslationServiceTests"); + string file1Path = Path.Combine(_tempDir.Path, "file1.zip"); + if (!File.Exists(file1Path)) + { + ZipFile.CreateFromDirectory( + Path.Combine(TestDataPath, "pt-project1"), + Path.Combine(_tempDir.Path, "file1.zip") + ); + } + string file2Path = Path.Combine(_tempDir.Path, "file2.zip"); + if (File.Exists(file2Path)) + File.Delete(file2Path); + if (addMatthew) + { + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, "pt-project3"), file2Path); + } + else + { + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, "pt-project2"), file2Path); + } + CorpusFile file1 = new() { Id = "file1", @@ -636,50 +668,29 @@ public TestEnvironment() Translation = "Chapter 1, verse 2.", }, ]); - ScriptureDataFileService = Substitute.For(); - ScriptureDataFileService.GetParatextProjectSettings("file1.zip").Returns(CreateProjectSettings("SRC")); - ScriptureDataFileService.GetParatextProjectSettings("file2.zip").Returns(CreateProjectSettings("TRG")); - var zipSubstituteSource = Substitute.For(); - var zipSubstituteTarget = Substitute.For(); - zipSubstituteSource - .OpenEntry("MATSRC.SFM") - .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(SourceUsfm))); - zipSubstituteTarget.OpenEntry("MATTRG.SFM").Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(""))); - zipSubstituteSource.EntryExists(Arg.Any()).Returns(false); - zipSubstituteTarget.EntryExists(Arg.Any()).Returns(false); - zipSubstituteSource.EntryExists("MATSRC.SFM").Returns(true); - zipSubstituteTarget.EntryExists("MATTRG.SFM").Returns(true); - TargetZipContainer = zipSubstituteTarget; - TextUpdaters = new List(); - Shared.Services.ZipParatextProjectTextUpdater GetTextUpdater(string type) - { - var updater = type switch - { - "SRC" => new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteSource, - CreateProjectSettings("SRC") - ), - "TRG" => new Shared.Services.ZipParatextProjectTextUpdater( - zipSubstituteTarget, - CreateProjectSettings("TRG") - ), - _ => throw new ArgumentException(), - }; - TextUpdaters.Add(updater); - return updater; - } - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file1.zip").Returns(x => GetTextUpdater("SRC")); - ScriptureDataFileService.GetZipParatextProjectTextUpdater("file2.zip").Returns(x => GetTextUpdater("TRG")); - Service = new PretranslationService(Pretranslations, Engines, Builds, ScriptureDataFileService); + IOptionsMonitor dataFileOptions = Substitute.For>(); + dataFileOptions.CurrentValue.Returns(new DataFileOptions() { FilesDirectory = _tempDir.Path }); + var parallelCorpusService = new ParallelCorpusService(); + Service = new PretranslationService( + Pretranslations, + Engines, + Builds, + new CorpusMappingService(dataFileOptions, parallelCorpusService), + parallelCorpusService + ); } public PretranslationService Service { get; } public MemoryRepository Pretranslations { get; } public MemoryRepository Engines { get; } public MemoryRepository Builds { get; } - public IScriptureDataFileService ScriptureDataFileService { get; } - public IZipContainer TargetZipContainer { get; } - public IList TextUpdaters { get; } + + private readonly TempDirectory _tempDir; + + public void Dispose() + { + _tempDir.Dispose(); + } public async Task GetUsfmAsync( PretranslationUsfmTextOrigin textOrigin, @@ -717,40 +728,5 @@ public async Task GetUsfmAsync( Assert.That(parallel_usfm, Is.EqualTo(usfm)); return usfm; } - - public void AddMatthewToTarget() - { - TargetZipContainer - .OpenEntry("MATTRG.SFM") - .Returns(x => new MemoryStream(Encoding.UTF8.GetBytes(TargetUsfm))); - } - - private static ParatextProjectSettings CreateProjectSettings(string name) - { - return new ParatextProjectSettings( - guid: "Id", - name: name, - fullName: name, - encoding: Encoding.UTF8, - versification: ScrVers.English, - stylesheet: new UsfmStylesheet("usfm.sty"), - fileNamePrefix: "", - fileNameForm: "MAT", - fileNameSuffix: $"{name}.SFM", - biblicalTermsListType: "Major", - biblicalTermsProjectName: "", - biblicalTermsFileName: "BiblicalTerms.xml", - languageCode: "en", - translationType: "Standard" - ); - } - - public void Dispose() - { - foreach (var updater in TextUpdaters) - { - updater.Dispose(); - } - } } } diff --git a/src/Serval/test/Serval.Translation.Tests/Usings.cs b/src/Serval/test/Serval.Translation.Tests/Usings.cs index 48c63ed06..1403ffb4e 100644 --- a/src/Serval/test/Serval.Translation.Tests/Usings.cs +++ b/src/Serval/test/Serval.Translation.Tests/Usings.cs @@ -1,4 +1,3 @@ -global using System.Text; global using System.Text.RegularExpressions; global using Grpc.Core; global using Grpc.Net.ClientFactory; @@ -9,11 +8,8 @@ global using NUnit.Framework; global using Serval.Shared.Configuration; global using Serval.Shared.Models; -global using Serval.Shared.Services; global using Serval.Shared.Utils; global using Serval.Translation.Contracts; global using Serval.Translation.Models; global using SIL.DataAccess; -global using SIL.Machine.Corpora; -global using SIL.Scripture; global using SIL.ServiceToolkit.Services; diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM new file mode 100644 index 000000000..8130771c2 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/41MATTe1.SFM @@ -0,0 +1,6 @@ +\id MAT - SRC +\c 1 +\v 1 SRC - Chapter one, verse one. +\p new paragraph +\v 2 +\v 3 SRC - Chapter one, verse three. diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml new file mode 100644 index 000000000..6358f4f0b --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test1 + 65001 + T + + NFC + Te1 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Tes.SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project1/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml new file mode 100644 index 000000000..4ce9e238d --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/Settings.xml @@ -0,0 +1,33 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test2 + 65001 + T + + NFC + Te2 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project2/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM new file mode 100644 index 000000000..90cb675c5 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/41MATTe3.SFM @@ -0,0 +1,5 @@ +\id MAT - TRG +\c 1 +\v 1 TRG - Chapter one, verse one. +\v 2 +\v 3 TRG - Chapter one, verse three. diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml new file mode 100644 index 000000000..873f4ab4b --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/Settings.xml @@ -0,0 +1,33 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test3 + 65001 + T + + NFC + Te3 + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + Ten.SFM + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs new file mode 100644 index 000000000..9c1cd3873 --- /dev/null +++ b/src/Serval/test/Serval.Translation.Tests/data/pt-project3/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs index 631c8fad8..c9ff894da 100644 --- a/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs @@ -1361,27 +1361,6 @@ public TestEnvironment() .Returns(WordAlignmentServiceClient); IOptionsMonitor dataFileOptions = Substitute.For>(); dataFileOptions.CurrentValue.Returns(new DataFileOptions()); - var scriptureDataFileService = Substitute.For(); - scriptureDataFileService - .GetParatextProjectSettings(Arg.Any()) - .Returns( - new ParatextProjectSettings( - guid: "Id", - name: "Tst", - fullName: "Test", - encoding: Encoding.UTF8, - versification: ScrVers.English, - stylesheet: new UsfmStylesheet("usfm.sty"), - fileNamePrefix: "TST", - fileNameForm: "MAT", - fileNameSuffix: ".USFM", - biblicalTermsListType: "BiblicalTerms", - biblicalTermsProjectName: "", - biblicalTermsFileName: "BiblicalTerms.xml", - languageCode: "en", - translationType: "Standard" - ) - ); WordAlignments = new MemoryRepository(); OutboxService = Substitute.For(); @@ -1392,7 +1371,7 @@ public TestEnvironment() new WordAlignmentOptions { Engines = [new EngineInfo { Type = "Statistical" }] } ); - Service = new EngineService( + Service = new TestEngineService( Engines, new MemoryRepository(), WordAlignments, @@ -1400,7 +1379,6 @@ public TestEnvironment() dataFileOptions, new MemoryDataAccessContext(), new LoggerFactory(), - scriptureDataFileService, OutboxService, wordAlignmentOptions ); @@ -1835,4 +1813,40 @@ private static AsyncUnaryCall CreateAsyncUnaryCall(TRespon } return alignedWordPairs; } + + private class TestEngineService( + IRepository engines, + IRepository builds, + IRepository wordAlignments, + GrpcClientFactory grpcClientFactory, + IOptionsMonitor dataFileOptions, + IDataAccessContext dataAccessContext, + ILoggerFactory loggerFactory, + IOutboxService outboxService, + IOptionsMonitor wordAlignmentOptions + ) + : EngineService( + engines, + builds, + wordAlignments, + grpcClientFactory, + dataFileOptions, + dataAccessContext, + loggerFactory, + outboxService, + wordAlignmentOptions + ) + { + protected override Dictionary> GetChapters(string fileLocation, string scriptureRange) + { + try + { + return ScriptureRangeParser.GetChapters(scriptureRange); + } + catch (ArgumentException ae) + { + throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); + } + } + } } diff --git a/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs b/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs index 7b14ff4a8..e1c4deec7 100644 --- a/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs +++ b/src/Serval/test/Serval.WordAlignment.Tests/Usings.cs @@ -1,4 +1,3 @@ -global using System.Text; global using Grpc.Core; global using Grpc.Net.ClientFactory; global using MassTransit; @@ -8,9 +7,6 @@ global using NUnit.Framework; global using Serval.Shared.Configuration; global using Serval.Shared.Models; -global using Serval.Shared.Services; global using Serval.Shared.Utils; global using Serval.WordAlignment.Models; global using SIL.DataAccess; -global using SIL.Machine.Corpora; -global using SIL.Scripture; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs index 058414b64..d4941ca26 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Configuration/IServiceCollectionsExtensions.cs @@ -2,10 +2,9 @@ namespace Microsoft.Extensions.DependencyInjection; public static class IServiceCollectionExtensions { - public static IServiceCollection AddParallelCorpusPreprocessor(this IServiceCollection services) + public static IServiceCollection AddParallelCorpusService(this IServiceCollection services) { - services.TryAddSingleton(); - services.TryAddSingleton(); + services.TryAddSingleton(); return services; } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs new file mode 100644 index 000000000..fb76591dc --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MissingParentProjectError.cs @@ -0,0 +1,7 @@ +namespace SIL.ServiceToolkit.Models; + +public record MissingParentProjectError +{ + public required string ProjectName { get; init; } + public required string ParentProjectName { get; init; } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs index 1ef59e6be..5b366a71b 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs @@ -9,6 +9,8 @@ public record MonolingualCorpus public Dictionary>? TrainOnChapters { get; set; } public HashSet? InferenceTextIds { get; set; } public Dictionary>? InferenceChapters { get; set; } + public bool TrainOnAll { get; set; } + public bool PretranslateAll { get; set; } public bool IsFiltered => TrainOnTextIds != null || TrainOnChapters != null || InferenceTextIds != null || InferenceChapters != null; diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs new file mode 100644 index 000000000..13a40e319 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelRow.cs @@ -0,0 +1,11 @@ +namespace SIL.ServiceToolkit.Models; + +public record ParallelRow +{ + public required IReadOnlyList SourceRefs { get; init; } + public required IReadOnlyList TargetRefs { get; init; } + public required string TargetText { get; init; } + public required IReadOnlyList? SourceTokens { get; init; } + public required IReadOnlyList? TargetTokens { get; init; } + public IReadOnlyList? Alignment { get; init; } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs index 2597ffc4b..46f23035b 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Models/TrainingDataType.cs @@ -3,5 +3,5 @@ namespace SIL.ServiceToolkit.Models; public enum TrainingDataType { Text = 0, - KeyTerms = 1, + KeyTerm = 1, } diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs new file mode 100644 index 000000000..09720985e --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/CorpusBundle.cs @@ -0,0 +1,226 @@ +using ZipParatextProjectTextUpdater = SIL.ServiceToolkit.Services.ZipParatextProjectTextUpdater; + +namespace SIL.ServiceToolkit.Utils; + +public class CorpusBundle +{ + private readonly Dictionary< + string, + (ParatextProjectSettings DaughterSettings, string? ParentLocation, ParatextProjectSettings? ParentSettings) + > _settings; + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> SourceTextCorpora { get; } + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> TargetTextCorpora { get; } + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> TextCorpora => SourceTextCorpora.Concat(TargetTextCorpora); + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> SourceTermCorpora { get; } + + public IEnumerable<( + ParallelCorpus ParallelCorpus, + MonolingualCorpus MonolingualCorpus, + IReadOnlyList CorpusFile, + IReadOnlyList TextCorpora + )> TargetTermCorpora { get; } + public IReadOnlyList ParallelCorpora { get; } + + public CorpusBundle(IEnumerable parallelCorpora) + { + ParallelCorpora = parallelCorpora.ToArray(); + + _settings = []; + IEnumerable corpusFiles = parallelCorpora.SelectMany(corpus => + corpus.SourceCorpora.Concat(corpus.TargetCorpora).SelectMany(c => c.Files) + ); + List<(string Location, ParatextProjectSettings Settings)> paratextProjects = []; + foreach (CorpusFile file in corpusFiles.Where(f => f.Format == FileFormat.Paratext)) + { + using IZipContainer archive = new ZipContainer(file.Location); + ParatextProjectSettings settings = new Services.ZipParatextProjectSettingsParser(archive).Parse(); + paratextProjects.Add((file.Location, settings)); + } + + foreach ((string daughterLocation, ParatextProjectSettings daughterSettings) in paratextProjects) + { + foreach ((string parentLocation, ParatextProjectSettings parentSettings) in paratextProjects) + { + if ( + daughterSettings != parentSettings + && daughterSettings.HasParent + && daughterSettings.IsDaughterProjectOf(parentSettings) + ) + { + daughterSettings.Parent = parentSettings; + _settings[daughterLocation] = (daughterSettings, parentLocation, parentSettings); + } + else + { + _settings[daughterLocation] = (daughterSettings, null, null); + } + } + } + + SourceTextCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.SourceCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTextCorpora(corpus.Files)) + ) + ); + + TargetTextCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.TargetCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTextCorpora(corpus.Files)) + ) + ); + + SourceTermCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.SourceCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTermCorpora(corpus.Files)) + ) + ); + + TargetTermCorpora = parallelCorpora.SelectMany(parallelCorpus => + parallelCorpus.TargetCorpora.Select(corpus => + (parallelCorpus, corpus, corpus.Files, CreateTermCorpora(corpus.Files)) + ) + ); + } + + public (string Location, ParatextProjectSettings Settings)? ParentOf(string daughterLocation) + { + if ( + !_settings.TryGetValue( + daughterLocation, + out (ParatextProjectSettings _, string? Location, ParatextProjectSettings? Settings) parent + ) + ) + { + return null; + } + if (parent.Location == null || parent.Settings == null) + { + return null; + } + return (parent.Location, parent.Settings); + } + + public ParatextProjectSettings? GetSettings(string location) + { + if ( + !_settings.TryGetValue( + location, + out ( + ParatextProjectSettings ParatextProjectSettings, + string? ParentLocation, + ParatextProjectSettings? ParentSettings + ) settings + ) + ) + { + return null; + } + return settings.ParatextProjectSettings; + } + + public ZipParatextProjectTextUpdater GetTextUpdater(string location) + { + IZipContainer container = new ZipContainer(location); + ParatextProjectSettings? parentSettings = ParentOf(location)?.Settings; + return new ZipParatextProjectTextUpdater(container, parentSettings); + } + + protected virtual IReadOnlyList CreateTextCorpora(IReadOnlyList files) + { + List corpora = []; + + List> textFileCorpora = []; + foreach (CorpusFile file in files) + { + switch (file.Format) + { + case FileFormat.Text: + // if there are multiple texts with the same id, then add it to a new corpus or the first + // corpus that doesn't contain a text with that id + Dictionary? corpus = textFileCorpora.FirstOrDefault(c => + !c.ContainsKey(file.TextId) + ); + if (corpus is null) + { + corpus = []; + textFileCorpora.Add(corpus); + } + corpus[file.TextId] = new TextFileText(file.TextId, file.Location); + break; + + case FileFormat.Paratext: + string? parentLocation = null; + if ( + _settings.TryGetValue( + file.Location, + out (ParatextProjectSettings, string? ParentLocation, ParatextProjectSettings?) settings + ) + ) + { + parentLocation = settings.ParentLocation; + } + corpora.Add( + new ParatextBackupTextCorpus( + file.Location, + includeAllText: true, + parentFileName: parentLocation + ) + ); + break; + } + } + foreach (Dictionary corpus in textFileCorpora) + corpora.Add(new DictionaryTextCorpus(corpus.Values)); + + return corpora; + } + + private IReadOnlyList CreateTermCorpora(IReadOnlyList files) + { + List corpora = []; + foreach (CorpusFile file in files) + { + switch (file.Format) + { + case FileFormat.Paratext: + string? parentLocation = null; + if ( + _settings.TryGetValue( + file.Location, + out (ParatextProjectSettings, string? ParentLocation, ParatextProjectSettings?) settings + ) + ) + { + parentLocation = settings.ParentLocation; + } + corpora.Add(new ParatextBackupTermsCorpus(file.Location, ["PN"], parentFileName: parentLocation)); + break; + } + } + return corpora; + } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs deleted file mode 100644 index b1dbacd89..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusPreprocessingService.cs +++ /dev/null @@ -1,17 +0,0 @@ -namespace SIL.ServiceToolkit.Services; - -public interface IParallelCorpusPreprocessingService -{ - QuoteConventionAnalysis? AnalyzeTargetCorpusQuoteConvention(ParallelCorpus corpus); - IReadOnlyList<(string CorpusId, IReadOnlyList Errors)> AnalyzeUsfmVersification( - ParallelCorpus parallelCorpus - ); - - Task PreprocessAsync( - IReadOnlyList corpora, - Func train, - Func inference, - bool useKeyTerms = false, - HashSet? ignoreUsfmMarkers = null - ); -} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs new file mode 100644 index 000000000..3b26e2e16 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/IParallelCorpusService.cs @@ -0,0 +1,58 @@ +namespace SIL.ServiceToolkit.Services; + +public interface IParallelCorpusService +{ + QuoteConventionAnalysis AnalyzeTargetQuoteConvention(IEnumerable parallelCorpora); + + IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + IReadOnlyList Errors + )> AnalyzeUsfmVersification(IEnumerable parallelCorpora); + + IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + MissingParentProjectError + )> FindMissingParentProjects(IEnumerable parallelCorpora); + + Task PreprocessAsync( + IEnumerable parallelCorpora, + Func train, + Func inference, + bool useKeyTerms = false, + HashSet? ignoreUsfmMarkers = null + ); + + string UpdateSourceUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + bool placeParagraphMarkers, + IEnumerable? remarks, + string? targetQuoteConvention + ); + + string UpdateTargetUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmTextBehavior textBehavior, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + IEnumerable? remarks, + string? targetQuoteConvention + ); + + Dictionary> GetChapters( + IReadOnlyList parallelCorpora, + string fileLocation, + string scriptureRange + ); +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs deleted file mode 100644 index 7e7651583..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ITextCorpusService.cs +++ /dev/null @@ -1,7 +0,0 @@ -namespace SIL.ServiceToolkit.Services; - -public interface ITextCorpusService -{ - IEnumerable CreateTextCorpora(IReadOnlyList files); - IEnumerable CreateTermCorpora(IReadOnlyList corpusFiles); -} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs deleted file mode 100644 index b5b3d4fcb..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusPreprocessingService.cs +++ /dev/null @@ -1,470 +0,0 @@ -using SIL.Scripture; - -namespace SIL.ServiceToolkit.Services; - -public class ParallelCorpusPreprocessingService(ITextCorpusService textCorpusService) - : IParallelCorpusPreprocessingService -{ - private readonly ITextCorpusService _textCorpusService = textCorpusService; - private const int Seed = 1234; - - public IReadOnlyList<(string CorpusId, IReadOnlyList Errors)> AnalyzeUsfmVersification( - ParallelCorpus parallelCorpus - ) - { - List<(string CorpusId, IReadOnlyList Errors)> errorsPerCorpus = []; - foreach ( - (CorpusFile file, MonolingualCorpus monolingualCorpus, bool isSource) in parallelCorpus - .SourceCorpora.SelectMany(c => - c.Files.Where(f => f.Format == FileFormat.Paratext).Select(f => (f, c, true)) - ) - .Concat( - parallelCorpus.TargetCorpora.SelectMany(c => - c.Files.Where(f => f.Format == FileFormat.Paratext).Select(f => (f, c, false)) - ) - ) - .DistinctBy(tuple => tuple.f.Location) - ) - { - using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); - IReadOnlyList errors = new ZipParatextProjectVersificationErrorDetector( - zipArchive - ).GetUsfmVersificationErrors(books: GetBooks(monolingualCorpus, isSource)); - if (errors.Count > 0) - { - errorsPerCorpus.Add((monolingualCorpus.Id, errors)); - } - } - return errorsPerCorpus; - } - - private static HashSet? GetBooks(MonolingualCorpus corpus, bool isSource) - { - if (!corpus.IsFiltered) - return null; - - List books = []; - if (corpus.TrainOnTextIds != null) - { - books.AddRange(corpus.TrainOnTextIds); - } - else if (corpus.TrainOnChapters != null) - { - books.AddRange(corpus.TrainOnChapters.Keys); - } - - if (isSource) - { - if (corpus.InferenceTextIds != null) - { - books.AddRange(corpus.InferenceTextIds); - } - else if (corpus.InferenceChapters != null) - { - books.AddRange(corpus.InferenceChapters.Keys); - } - } - return [.. books.Select(bookName => Canon.BookIdToNumber(bookName))]; - } - - public QuoteConventionAnalysis? AnalyzeTargetCorpusQuoteConvention(ParallelCorpus parallelCorpus) - { - List analyses = []; - foreach (MonolingualCorpus targetMonolingualCorpus in parallelCorpus.TargetCorpora) - { - foreach (CorpusFile file in targetMonolingualCorpus.Files.Where(f => f.Format == FileFormat.Paratext)) - { - using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); - var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector(zipArchive); - Dictionary>? chapters = null; - if (targetMonolingualCorpus.TrainOnTextIds is not null) - { - chapters = targetMonolingualCorpus.TrainOnTextIds.ToDictionary( - id => Canon.BookIdToNumber(id), - _ => new List() - ); - } - else if (targetMonolingualCorpus.TrainOnChapters is not null) - { - chapters = targetMonolingualCorpus.TrainOnChapters.ToDictionary( - kvp => Canon.BookIdToNumber(kvp.Key), - kvp => kvp.Value.ToList() - ); - } - if (chapters != null) - analyses.Add(quoteConventionDetector.GetQuoteConventionAnalysis(chapters)); - else - analyses.Add(quoteConventionDetector.GetQuoteConventionAnalysis()); - } - } - - return QuoteConventionAnalysis.CombineWithWeightedAverage(analyses); - } - - public async Task PreprocessAsync( - IReadOnlyList corpora, - Func train, - Func inference, - bool useKeyTerms = false, - HashSet? ignoreUsfmMarkers = null - ) - { - ignoreUsfmMarkers ??= []; - - bool parallelTrainingDataPresent = false; - List keyTermTrainingData = new(); - - // Create source and target dictionaries that map from a parallel corpus id - // to an array of all of that parallel corpus' monolingual corpora and associated text corpora - Dictionary sourceCorpora = corpora - .Select(corpus => - ( - CorpusId: corpus.Id, - Corpora: corpus - .SourceCorpora.SelectMany(c => - _textCorpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)) - ) - .ToArray() - ) - ) - .ToDictionary(tup => tup.CorpusId, tup => tup.Corpora); - - Dictionary targetCorpora = corpora - .Select(corpus => - ( - CorpusId: corpus.Id, - Corpora: corpus - .TargetCorpora.SelectMany(c => - _textCorpusService.CreateTextCorpora(c.Files).Select(tc => (c, tc)) - ) - .ToArray() - ) - ) - .ToDictionary(tup => tup.CorpusId, tup => tup.Corpora); - - // Filter the text corpora for training based on the filters specified in the monolingual corpora - ITextCorpus[] sourceTrainingCorpora = sourceCorpora - .Values.SelectMany(sc => sc) - .Select(sc => FilterTrainingCorpora(sc.Corpus, sc.TextCorpus)) - .ToArray(); - - ITextCorpus[] targetTrainingCorpora = targetCorpora - .Values.SelectMany(tc => tc) - .Select(tc => FilterTrainingCorpora(tc.Corpus, tc.TextCorpus)) - .ToArray(); - - // To support mixed source, collapse multiple source text corpora into one text corpus - // by randomly interlacing content from each of the source text corpora - ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); - if (sourceTrainingCorpus.IsScripture()) - { - // Filter out all non-scripture; we only train on scripture content - sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow); - } - - // Instead of interlacing rows from the target text corpora randomly, just take the - // text row from the first target text corpus that has content for that row - ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst(); - if (targetTrainingCorpus.IsScripture()) - { - // Filter out all non-scripture; we only train on scripture content - targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow); - } - - // Align source and target training data - ParallelTextRow[] trainingRows = sourceTrainingCorpus - .AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true) - .ToArray(); - - // After merging segments across ranges, run the 'train' preprocessing function - // on each training row and record whether any parallel training data was present - foreach (Row row in CollapseRanges(trainingRows)) - { - await train(row, TrainingDataType.Text); - if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) - { - parallelTrainingDataPresent = true; - } - } - - if (useKeyTerms) - { - // Create a terms corpus for each corpus file - ITextCorpus[]? sourceTermCorpora = _textCorpusService - .CreateTermCorpora( - sourceCorpora.Values.SelectMany(sc => sc).SelectMany(corpus => corpus.Corpus.Files).ToArray() - ) - .ToArray(); - ITextCorpus[]? targetTermCorpora = _textCorpusService - .CreateTermCorpora( - targetCorpora.Values.SelectMany(tc => tc).SelectMany(corpus => corpus.Corpus.Files).ToArray() - ) - .ToArray(); - - if (sourceTermCorpora is not null && targetTermCorpora is not null) - { - // As with scripture data, interlace the source rows randomly - // but choose the first non-empty target row, then align - IParallelTextCorpus parallelKeyTermsCorpus = sourceTermCorpora - .ChooseRandom(Seed) - .AlignRows(targetTermCorpora.ChooseFirst()); - - // Only train on unique key terms pairs - foreach ( - ParallelTextRow row in parallelKeyTermsCorpus.DistinctBy(row => (row.SourceText, row.TargetText)) - ) - { - keyTermTrainingData.Add( - new Row(row.TextId, row.SourceRefs, row.TargetRefs, row.SourceText, row.TargetText, 1) - ); - } - } - } - - // Since we ultimately need to provide inferences for a particular parallel corpus, - // we need to preprocess the content on which to inference per parallel corpus - foreach (ParallelCorpus corpus in corpora) - { - // Filter the text corpora based on the filters specified in the monolingual corpora - ITextCorpus sourceInferencingCorpus = sourceCorpora[corpus.Id] - .Select(sc => FilterInferencingCorpora(sc.Corpus, sc.TextCorpus, ignoreUsfmMarkers)) - .ChooseFirst(); - - ITextCorpus targetInferencingCorpus = targetCorpora[corpus.Id] - .Select(tc => FilterInferencingCorpora(tc.Corpus, tc.TextCorpus, ignoreUsfmMarkers)) - .ChooseFirst(); - - // We need to align all three of these corpora because we need both the source and target - // content for inferencing (the target is only needed in some contexts like word alignment) - // as well as the target training corpus in order to determine whether a row was already - // used in training. - INParallelTextCorpus inferencingCorpus = new ITextCorpus[] - { - sourceInferencingCorpus, - targetInferencingCorpus, - targetTrainingCorpus, - }.AlignMany([true, false, false]); - - foreach ((Row row, bool isInTrainingData) in CollapseInferencingRanges(inferencingCorpus.ToArray())) - { - await inference(row, isInTrainingData, corpus); - } - } - - // Only train on key terms if there were other parallel scripture data. - // This is necessary to support inference-only jobs since the terms are not - // filtered by the filters specified in the monolingual corpora. - if (useKeyTerms && parallelTrainingDataPresent) - { - foreach (Row row in keyTermTrainingData) - { - await train(row, TrainingDataType.KeyTerms); - } - } - } - - private static ITextCorpus FilterInferencingCorpora( - MonolingualCorpus corpus, - ITextCorpus textCorpus, - HashSet ignoreUsfmMarkers - ) - { - textCorpus = textCorpus.Transform(CleanSegment); - if (corpus.InferenceTextIds is not null) - { - textCorpus = textCorpus.FilterTexts(corpus.InferenceTextIds); - } - else if (corpus.InferenceChapters is not null) - { - textCorpus = textCorpus - .FilterTexts(corpus.InferenceChapters.Keys) - .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.InferenceChapters)); - } - return textCorpus.Where(row => row.Ref is not ScriptureRef sr || !HasIgnorableMarker(sr, ignoreUsfmMarkers)); - } - - private static ITextCorpus FilterTrainingCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) - { - textCorpus = textCorpus.Transform(CleanSegment); - if (corpus.TrainOnTextIds is not null) - { - return textCorpus.FilterTexts(corpus.TrainOnTextIds); - } - if (corpus.TrainOnChapters is not null) - { - return textCorpus - .FilterTexts(corpus.TrainOnChapters.Keys) - .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.TrainOnChapters)); - } - return textCorpus; - } - - private static IEnumerable CollapseRanges(ParallelTextRow[] rows) - { - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - List sourceRefs = []; - List targetRefs = []; - string textId = ""; - bool hasUnfinishedRange = false; - - foreach (ParallelTextRow row in rows) - { - if ( - hasUnfinishedRange - && (!row.IsTargetInRange || row.IsTargetRangeStart) - && (!row.IsSourceInRange || row.IsSourceRangeStart) - ) - { - yield return new Row( - textId, - sourceRefs, - targetRefs, - srcSegBuffer.ToString(), - trgSegBuffer.ToString(), - 1 - ); - - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - targetRefs.Clear(); - - hasUnfinishedRange = false; - } - - textId = row.TextId; - sourceRefs.AddRange(row.SourceRefs); - targetRefs.AddRange(row.TargetRefs); - if (row.SourceText.Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(row.SourceText); - } - if (row.TargetText.Length > 0) - { - if (trgSegBuffer.Length > 0) - trgSegBuffer.Append(' '); - trgSegBuffer.Append(row.TargetText); - } - - if (row.IsTargetInRange || row.IsSourceInRange) - { - hasUnfinishedRange = true; - continue; - } - - yield return new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - sourceRefs.Clear(); - targetRefs.Clear(); - } - if (hasUnfinishedRange) - { - yield return new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); - } - } - - private static IEnumerable<(Row, bool)> CollapseInferencingRanges(NParallelTextRow[] rows) - { - StringBuilder srcSegBuffer = new(); - StringBuilder trgSegBuffer = new(); - List sourceRefs = []; - List targetRefs = []; - string textId = ""; - bool hasUnfinishedRange = false; - bool isInTrainingData = false; - - foreach (NParallelTextRow row in rows) - { - //row at 0 is source filtered for inferencing, row at 1 is target filtered for inferencing, row at 2 is target filtered for training - if ( - hasUnfinishedRange - && (!row.IsInRange(0) || row.IsRangeStart(0)) - && (!row.IsInRange(1) || row.IsRangeStart(1)) - && (!row.IsInRange(2) || row.IsRangeStart(2)) - ) - { - yield return ( - new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1), - isInTrainingData - ); - - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - sourceRefs.Clear(); - targetRefs.Clear(); - isInTrainingData = false; - hasUnfinishedRange = false; - } - - textId = row.TextId; - sourceRefs.AddRange(row.NRefs[0]); - targetRefs.AddRange(row.NRefs[2].Count > 0 ? row.NRefs[2] : row.NRefs[1]); - isInTrainingData = isInTrainingData || row.Text(2).Length > 0; - - if (row.Text(0).Length > 0) - { - if (srcSegBuffer.Length > 0) - srcSegBuffer.Append(' '); - srcSegBuffer.Append(row.Text(0)); - } - if (row.Text(1).Length > 0) - { - if (trgSegBuffer.Length > 0) - trgSegBuffer.Append(' '); - trgSegBuffer.Append(row.Text(1)); - } - - if (row.IsInRange(0) || row.IsInRange(1) || row.IsInRange(2)) - { - hasUnfinishedRange = true; - continue; - } - - yield return ( - new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1), - isInTrainingData - ); - - srcSegBuffer.Clear(); - trgSegBuffer.Clear(); - sourceRefs.Clear(); - targetRefs.Clear(); - isInTrainingData = false; - } - if (hasUnfinishedRange) - { - yield return ( - new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1), - isInTrainingData - ); - } - } - - private static bool IsScriptureRow(TextRow parallelTextRow) - { - return parallelTextRow.Ref is ScriptureRef sr && sr.IsVerse; - } - - private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) - { - return selection.TryGetValue(sr.Book, out HashSet? chapters) - && chapters != null - && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); - } - - private static bool HasIgnorableMarker(ScriptureRef sr, HashSet ignoreUsfmMarkers) - { - return sr.Path.Any(e => ignoreUsfmMarkers.Contains(e.Name)); - } - - private static TextRow CleanSegment(TextRow row) - { - if (row.Text == "...") - row.Segment = []; - return row; - } -} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs new file mode 100644 index 000000000..a4e11124c --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs @@ -0,0 +1,806 @@ +using System.Globalization; +using SIL.Machine.Translation; +using SIL.Scripture; + +namespace SIL.ServiceToolkit.Services; + +public class ParallelCorpusService : IParallelCorpusService +{ + private const int Seed = 1234; + + public IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + IReadOnlyList Errors + )> AnalyzeUsfmVersification(IEnumerable parallelCorpora) + { + CorpusBundle corpusBundle = new(parallelCorpora); + List<( + string ParallelCorpusId, + string MonolingualCorpusId, + IReadOnlyList Errors + )> errorsPerCorpus = []; + foreach ( + ( + ParallelCorpus parallelCorpus, + MonolingualCorpus monolingualCorpus, + IReadOnlyList files, + _ + ) in corpusBundle.TextCorpora + ) + { + foreach (CorpusFile file in files.Where(f => f.Format == FileFormat.Paratext)) + { + using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); + IReadOnlyList errors = new ZipParatextProjectVersificationErrorDetector( + zipArchive, + corpusBundle.ParentOf(file.Location)?.Settings + ).GetUsfmVersificationErrors(books: GetBooks(monolingualCorpus)); + if (errors.Count > 0) + { + errorsPerCorpus.Add((parallelCorpus.Id, monolingualCorpus.Id, errors)); + } + } + } + return errorsPerCorpus; + } + + public QuoteConventionAnalysis AnalyzeTargetQuoteConvention(IEnumerable parallelCorpora) + { + CorpusBundle corpusBundle = new(parallelCorpora); + Dictionary> analyses = []; + foreach ( + ( + ParallelCorpus parallelCorpus, + MonolingualCorpus targetMonolingualCorpus, + IReadOnlyList corpusFiles, + _ + ) in corpusBundle.TargetTextCorpora + ) + { + foreach (CorpusFile file in corpusFiles.Where(f => f.Format == FileFormat.Paratext)) + { + using ZipArchive zipArchive = ZipFile.OpenRead(file.Location); + var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector( + zipArchive, + corpusBundle.ParentOf(file.Location)?.Settings + ); + Dictionary>? chapters = null; + if (targetMonolingualCorpus.TrainOnTextIds is not null) + { + chapters = targetMonolingualCorpus.TrainOnTextIds.ToDictionary( + id => Canon.BookIdToNumber(id), + _ => new List() + ); + } + else if (targetMonolingualCorpus.TrainOnChapters is not null) + { + chapters = targetMonolingualCorpus.TrainOnChapters.ToDictionary( + kvp => Canon.BookIdToNumber(kvp.Key), + kvp => kvp.Value.ToList() + ); + } + if (!analyses.ContainsKey(parallelCorpus.Id)) + analyses[parallelCorpus.Id] = []; + if (chapters != null) + analyses[parallelCorpus.Id].Add(quoteConventionDetector.GetQuoteConventionAnalysis(chapters)); + else + analyses[parallelCorpus.Id].Add(quoteConventionDetector.GetQuoteConventionAnalysis()); + } + } + + return QuoteConventionAnalysis.CombineWithWeightedAverage( + analyses.Select(kvp => QuoteConventionAnalysis.CombineWithWeightedAverage(kvp.Value)).ToList() + ); + } + + public IReadOnlyList<( + string ParallelCorpusId, + string MonolingualCorpusId, + MissingParentProjectError + )> FindMissingParentProjects(IEnumerable parallelCorpora) + { + CorpusBundle corpusBundle = new(parallelCorpora); + List<(string, string, MissingParentProjectError)> errors = []; + foreach ( + ( + ParallelCorpus parallelCorpus, + MonolingualCorpus monolingualCorpus, + IReadOnlyList files, + _ + ) in corpusBundle.TextCorpora + ) + { + foreach (CorpusFile file in files.Where(f => f.Format == FileFormat.Paratext)) + { + using ZipArchive archive = ZipFile.OpenRead(file.Location); + ParatextProjectSettings settings = Machine.Corpora.ZipParatextProjectSettingsParser.Parse(archive); + if (settings.HasParent && corpusBundle.ParentOf(file.Location) == null) + { + errors.Add( + ( + parallelCorpus.Id, + monolingualCorpus.Id, + new() { ProjectName = settings.Name, ParentProjectName = settings.ParentName } + ) + ); + } + } + } + + return errors; + } + + public async Task PreprocessAsync( + IEnumerable parallelCorpora, + Func train, + Func inference, + bool useKeyTerms = false, + HashSet? ignoreUsfmMarkers = null + ) + { + await PreprocessAsync(new CorpusBundle(parallelCorpora), train, inference, useKeyTerms, ignoreUsfmMarkers); + } + + public async Task PreprocessAsync( + CorpusBundle corpusBundle, + Func train, + Func inference, + bool useKeyTerms = false, + HashSet? ignoreUsfmMarkers = null + ) + { + ignoreUsfmMarkers ??= []; + + bool parallelTrainingDataPresent = false; + List keyTermTrainingData = new(); + + // Create source and target arrays of text corpora filtered for training + // based on the filters specified in the associated monolingual corpora + ITextCorpus[] sourceTrainingCorpora = corpusBundle + .SourceTextCorpora.SelectMany(c => + c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) + ) + .ToArray(); + + ITextCorpus[] targetTrainingCorpora = corpusBundle + .TargetTextCorpora.SelectMany(c => + c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc)) + ) + .ToArray(); + + // To support mixed source, collapse multiple source text corpora into one text corpus + // by randomly interlacing content from each of the source text corpora + ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed); + if (sourceTrainingCorpus.IsScripture()) + { + // Filter out all non-scripture; we only train on scripture content + sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow); + } + + // Instead of interlacing rows from the target text corpora randomly, just take the + // text row from the first target text corpus that has content for that row + ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst(); + if (targetTrainingCorpus.IsScripture()) + { + // Filter out all non-scripture; we only train on scripture content + targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow); + } + + // Align source and target training data + ParallelTextRow[] trainingRows = sourceTrainingCorpus + .AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true) + .ToArray(); + + // After merging segments across ranges, run the 'train' preprocessing function + // on each training row and record whether any parallel training data was present + foreach (Row row in CollapseRanges(trainingRows)) + { + await train(row, TrainingDataType.Text); + if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0) + { + parallelTrainingDataPresent = true; + } + } + + if (useKeyTerms) + { + // Create a terms corpus for each corpus file + ITextCorpus[] sourceTermCorpora = corpusBundle.SourceTermCorpora.SelectMany(c => c.TextCorpora).ToArray(); + ITextCorpus[] targetTermCorpora = corpusBundle.TargetTermCorpora.SelectMany(c => c.TextCorpora).ToArray(); + + // As with scripture data, interlace the source rows randomly + // but choose the first non-empty target row, then align + IParallelTextCorpus parallelKeyTermCorpus = sourceTermCorpora + .ChooseRandom(Seed) + .AlignRows(targetTermCorpora.ChooseFirst()); + + // Only train on unique key terms pairs + foreach (ParallelTextRow row in parallelKeyTermCorpus.DistinctBy(row => (row.SourceText, row.TargetText))) + { + keyTermTrainingData.Add( + new Row(row.TextId, row.SourceRefs, row.TargetRefs, row.SourceText, row.TargetText, 1) + ); + } + } + + // Since we ultimately need to provide inferences for a particular parallel corpus, + // we need to preprocess the content on which to inference per parallel corpus + foreach (ParallelCorpus parallelCorpus in corpusBundle.ParallelCorpora) + { + // Filter the text corpora based on the filters specified in the monolingual corpora + ITextCorpus sourceInferencingCorpus = corpusBundle + .SourceTextCorpora.Where(c => c.ParallelCorpus.Id == parallelCorpus.Id) + .SelectMany(sc => + sc.TextCorpora.Select(textCorpus => + FilterInferencingCorpora(sc.MonolingualCorpus, textCorpus, ignoreUsfmMarkers) + ) + ) + .ChooseFirst(); + + ITextCorpus targetInferencingCorpus = corpusBundle + .TargetTextCorpora.Where(c => c.ParallelCorpus.Id == parallelCorpus.Id) + .SelectMany(tc => + tc.TextCorpora.Select(textCorpus => + FilterInferencingCorpora(tc.MonolingualCorpus, textCorpus, ignoreUsfmMarkers) + ) + ) + .ChooseFirst(); + + // We need to align all three of these corpora because we need both the source and target + // content for inferencing (the target is only needed in some contexts like word alignment) + // as well as the target training corpus in order to determine whether a row was already + // used in training. + INParallelTextCorpus inferencingCorpus = new ITextCorpus[] + { + sourceInferencingCorpus, + targetInferencingCorpus, + targetTrainingCorpus, + }.AlignMany([true, false, false]); + + foreach ((Row row, bool isInTrainingData) in CollapseInferencingRanges(inferencingCorpus.ToArray())) + { + await inference(row, isInTrainingData, parallelCorpus.Id); + } + } + + // Only train on key terms if there were other parallel scripture data. + // This is necessary to support inference-only jobs since the terms are not + // filtered by the filters specified in the monolingual corpora. + if (useKeyTerms && parallelTrainingDataPresent) + { + foreach (Row row in keyTermTrainingData) + { + await train(row, TrainingDataType.KeyTerm); + } + } + } + + private static ITextCorpus FilterInferencingCorpora( + MonolingualCorpus corpus, + ITextCorpus textCorpus, + HashSet ignoreUsfmMarkers + ) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.InferenceTextIds is not null) + { + textCorpus = textCorpus.FilterTexts(corpus.InferenceTextIds); + } + else if (corpus.InferenceChapters is not null) + { + textCorpus = textCorpus + .FilterTexts(corpus.InferenceChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.InferenceChapters)); + } + return textCorpus.Where(row => row.Ref is not ScriptureRef sr || !HasIgnorableMarker(sr, ignoreUsfmMarkers)); + } + + private static ITextCorpus FilterTrainingCorpora(MonolingualCorpus corpus, ITextCorpus textCorpus) + { + textCorpus = textCorpus.Transform(CleanSegment); + if (corpus.TrainOnTextIds is not null) + { + return textCorpus.FilterTexts(corpus.TrainOnTextIds); + } + if (corpus.TrainOnChapters is not null) + { + return textCorpus + .FilterTexts(corpus.TrainOnChapters.Keys) + .Where(row => row.Ref is not ScriptureRef sr || IsInChapters(sr, corpus.TrainOnChapters)); + } + return textCorpus; + } + + private static IEnumerable CollapseRanges(ParallelTextRow[] rows) + { + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + List sourceRefs = []; + List targetRefs = []; + string textId = ""; + bool hasUnfinishedRange = false; + + foreach (ParallelTextRow row in rows) + { + if ( + hasUnfinishedRange + && (!row.IsTargetInRange || row.IsTargetRangeStart) + && (!row.IsSourceInRange || row.IsSourceRangeStart) + ) + { + yield return new Row( + textId, + sourceRefs, + targetRefs, + srcSegBuffer.ToString(), + trgSegBuffer.ToString(), + 1 + ); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + targetRefs.Clear(); + + hasUnfinishedRange = false; + } + + textId = row.TextId; + sourceRefs.AddRange(row.SourceRefs); + targetRefs.AddRange(row.TargetRefs); + if (row.SourceText.Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(row.SourceText); + } + if (row.TargetText.Length > 0) + { + if (trgSegBuffer.Length > 0) + trgSegBuffer.Append(' '); + trgSegBuffer.Append(row.TargetText); + } + + if (row.IsTargetInRange || row.IsSourceInRange) + { + hasUnfinishedRange = true; + continue; + } + + yield return new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + sourceRefs.Clear(); + targetRefs.Clear(); + } + if (hasUnfinishedRange) + { + yield return new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1); + } + } + + private static IEnumerable<(Row, bool)> CollapseInferencingRanges(NParallelTextRow[] rows) + { + StringBuilder srcSegBuffer = new(); + StringBuilder trgSegBuffer = new(); + List sourceRefs = []; + List targetRefs = []; + string textId = ""; + bool hasUnfinishedRange = false; + bool isInTrainingData = false; + + foreach (NParallelTextRow row in rows) + { + //row at 0 is source filtered for inferencing, row at 1 is target filtered for inferencing, row at 2 is target filtered for training + if ( + hasUnfinishedRange + && (!row.IsInRange(0) || row.IsRangeStart(0)) + && (!row.IsInRange(1) || row.IsRangeStart(1)) + && (!row.IsInRange(2) || row.IsRangeStart(2)) + ) + { + yield return ( + new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1), + isInTrainingData + ); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + sourceRefs.Clear(); + targetRefs.Clear(); + isInTrainingData = false; + hasUnfinishedRange = false; + } + + textId = row.TextId; + sourceRefs.AddRange(row.NRefs[0]); + targetRefs.AddRange(row.NRefs[2].Count > 0 ? row.NRefs[2] : row.NRefs[1]); + isInTrainingData = isInTrainingData || row.Text(2).Length > 0; + + if (row.Text(0).Length > 0) + { + if (srcSegBuffer.Length > 0) + srcSegBuffer.Append(' '); + srcSegBuffer.Append(row.Text(0)); + } + if (row.Text(1).Length > 0) + { + if (trgSegBuffer.Length > 0) + trgSegBuffer.Append(' '); + trgSegBuffer.Append(row.Text(1)); + } + + if (row.IsInRange(0) || row.IsInRange(1) || row.IsInRange(2)) + { + hasUnfinishedRange = true; + continue; + } + + yield return ( + new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1), + isInTrainingData + ); + + srcSegBuffer.Clear(); + trgSegBuffer.Clear(); + sourceRefs.Clear(); + targetRefs.Clear(); + isInTrainingData = false; + } + if (hasUnfinishedRange) + { + yield return ( + new Row(textId, sourceRefs, targetRefs, srcSegBuffer.ToString(), trgSegBuffer.ToString(), 1), + isInTrainingData + ); + } + } + + private static bool IsScriptureRow(TextRow parallelTextRow) + { + return parallelTextRow.Ref is ScriptureRef sr && sr.IsVerse; + } + + private static bool IsInChapters(ScriptureRef sr, Dictionary> selection) + { + return selection.TryGetValue(sr.Book, out HashSet? chapters) + && chapters != null + && (chapters.Count == 0 || chapters.Contains(sr.ChapterNum)); + } + + private static bool HasIgnorableMarker(ScriptureRef sr, HashSet ignoreUsfmMarkers) + { + return sr.Path.Any(e => ignoreUsfmMarkers.Contains(e.Name)); + } + + private static TextRow CleanSegment(TextRow row) + { + if (row.Text == "...") + row.Segment = []; + return row; + } + + private static HashSet? GetBooks(MonolingualCorpus corpus) + { + if (!corpus.IsFiltered) + return null; + + List books = []; + if (corpus.TrainOnTextIds != null) + { + books.AddRange(corpus.TrainOnTextIds); + } + else if (corpus.TrainOnChapters != null) + { + books.AddRange(corpus.TrainOnChapters.Keys); + } + + if (corpus.InferenceTextIds != null) + { + books.AddRange(corpus.InferenceTextIds); + } + else if (corpus.InferenceChapters != null) + { + books.AddRange(corpus.InferenceChapters.Keys); + } + + return [.. books.Select(bookName => Canon.BookIdToNumber(bookName))]; + } + + public string UpdateSourceUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + bool placeParagraphMarkers, + IEnumerable? remarks, + string? targetQuoteConvention + ) + { + return UpdateUsfm( + parallelCorpora, + corpusId, + bookId, + rows, + UpdateUsfmTextBehavior.StripExisting, + paragraphBehavior, + embedBehavior, + styleBehavior, + placeParagraphMarkers ? [new PlaceMarkersUsfmUpdateBlockHandler()] : null, + remarks, + targetQuoteConvention, + isSource: true + ); + } + + public string UpdateTargetUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IReadOnlyList rows, + UpdateUsfmTextBehavior textBehavior, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + IEnumerable? remarks, + string? targetQuoteConvention + ) + { + return UpdateUsfm( + parallelCorpora, + corpusId, + bookId, + rows, + textBehavior, + paragraphBehavior, + embedBehavior, + styleBehavior, + updateBlockHandlers: null, + remarks, + targetQuoteConvention, + isSource: false + ); + } + + private static string UpdateUsfm( + IReadOnlyList parallelCorpora, + string corpusId, + string bookId, + IEnumerable rows, + UpdateUsfmTextBehavior textBehavior, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior embedBehavior, + UpdateUsfmMarkerBehavior styleBehavior, + IEnumerable? updateBlockHandlers, + IEnumerable? remarks, + string? targetQuoteConvention, + bool isSource + ) + { + CorpusBundle corpusBundle = new(parallelCorpora); + ParallelCorpus corpus = corpusBundle.ParallelCorpora.Single(c => c.Id == corpusId); + CorpusFile sourceFile = corpus.SourceCorpora[0].Files[0]; + CorpusFile targetFile = corpus.TargetCorpora[0].Files[0]; + ParatextProjectSettings? sourceSettings = corpusBundle.GetSettings(sourceFile.Location); + ParatextProjectSettings? targetSettings = corpusBundle.GetSettings(targetFile.Location); + + using ZipParatextProjectTextUpdater updater = corpusBundle.GetTextUpdater( + isSource ? sourceFile.Location : targetFile.Location + ); + string usfm = + updater.UpdateUsfm( + bookId, + rows.Select(p => + Map( + p, + isSource, + sourceSettings?.Versification, + targetSettings?.Versification, + paragraphBehavior, + styleBehavior + ) + ) + .Where(row => row.Refs.Any()) + .OrderBy(row => row.Refs[0]) + .ToArray(), + isSource ? sourceSettings?.FullName : targetSettings?.FullName, + textBehavior, + paragraphBehavior, + embedBehavior, + styleBehavior, + updateBlockHandlers: updateBlockHandlers, + remarks: remarks, + errorHandler: (_) => true, + compareSegments: isSource + ) ?? ""; + + if (!string.IsNullOrEmpty(targetQuoteConvention)) + usfm = DenormalizeQuotationMarks(usfm, targetQuoteConvention); + return usfm; + } + + private static UpdateUsfmRow Map( + ParallelRow row, + bool isSource, + ScrVers? sourceVersification, + ScrVers? targetVersification, + UpdateUsfmMarkerBehavior paragraphBehavior, + UpdateUsfmMarkerBehavior styleBehavior + ) + { + Dictionary? metadata = null; + if (row.Alignment is not null) + { + metadata = new Dictionary + { + { + PlaceMarkersAlignmentInfo.MetadataKey, + new PlaceMarkersAlignmentInfo( + row.SourceTokens, + row.TargetTokens, + CreateWordAlignmentMatrix(row), + paragraphBehavior, + styleBehavior + ) + }, + }; + } + + ScriptureRef[] refs; + if (isSource) + { + refs = ( + row.SourceRefs.Any() + ? Map(row.SourceRefs, sourceVersification) + : Map(row.TargetRefs, targetVersification) + ).ToArray(); + } + else + { + // the pretranslations are generated from the source book and inserted into the target book + // use relaxed references since the USFM structure may not be the same + refs = Map(row.TargetRefs, targetVersification).Select(r => r.ToRelaxed()).ToArray(); + } + + return new UpdateUsfmRow(refs, row.TargetText, metadata); + } + + private static IEnumerable Map(IEnumerable refs, ScrVers? versification) + { + return refs.Select(r => + { + ScriptureRef.TryParse(r, versification, out ScriptureRef sr); + return sr; + }) + .Where(r => !r.IsEmpty); + } + + private static WordAlignmentMatrix? CreateWordAlignmentMatrix(ParallelRow row) + { + if (row.Alignment is null || row.SourceTokens is null || row.TargetTokens is null) + { + return null; + } + + var matrix = new WordAlignmentMatrix(row.SourceTokens.Count, row.TargetTokens.Count); + foreach (AlignedWordPair wordPair in row.Alignment) + matrix[wordPair.SourceIndex, wordPair.TargetIndex] = true; + + return matrix; + } + + private static string DenormalizeQuotationMarks(string usfm, string quoteConvention) + { + QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(quoteConvention); + if (targetQuoteConvention is null) + return usfm; + + QuotationMarkDenormalizationFirstPass quotationMarkDenormalizationFirstPass = new(targetQuoteConvention); + + UsfmParser.Parse(usfm, quotationMarkDenormalizationFirstPass); + List<(int ChapterNumber, QuotationMarkUpdateStrategy Strategy)> bestChapterStrategies = + quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); + + QuotationMarkDenormalizationUsfmUpdateBlockHandler quotationMarkDenormalizer = new( + targetQuoteConvention, + new QuotationMarkUpdateSettings( + chapterStrategies: bestChapterStrategies.Select(tuple => tuple.Strategy).ToList() + ) + ); + int denormalizableChapterCount = bestChapterStrategies.Count(tup => + tup.Strategy != QuotationMarkUpdateStrategy.Skip + ); + List remarks = []; + string quotationDenormalizationRemark; + if (denormalizableChapterCount == bestChapterStrategies.Count) + { + quotationDenormalizationRemark = + "The quote style in all chapters has been automatically adjusted to match the rest of the project."; + } + else if (denormalizableChapterCount > 0) + { + quotationDenormalizationRemark = + "The quote style in the following chapters has been automatically adjusted to match the rest of the project: " + + GetChapterRangesString( + bestChapterStrategies + .Where(tuple => tuple.Strategy != QuotationMarkUpdateStrategy.Skip) + .Select(tuple => tuple.ChapterNumber) + .ToList() + ) + + "."; + } + else + { + quotationDenormalizationRemark = + "The quote style was not automatically adjusted to match the rest of your project in any chapters."; + } + remarks.Add(quotationDenormalizationRemark); + + var updater = new UpdateUsfmParserHandler(updateBlockHandlers: [quotationMarkDenormalizer], remarks: remarks); + UsfmParser.Parse(usfm, updater); + + usfm = updater.GetUsfm(); + return usfm; + } + + public static string GetChapterRangesString(List chapterNumbers) + { + chapterNumbers = chapterNumbers.Order().ToList(); + int start = chapterNumbers[0]; + int end = chapterNumbers[0]; + List chapterRangeStrings = []; + foreach (int chapterNumber in chapterNumbers[1..]) + { + if (chapterNumber == end + 1) + { + end = chapterNumber; + } + else + { + if (start == end) + { + chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); + } + else + { + chapterRangeStrings.Add($"{start}-{end}"); + } + start = chapterNumber; + end = chapterNumber; + } + } + if (start == end) + { + chapterRangeStrings.Add(start.ToString(CultureInfo.InvariantCulture)); + } + else + { + chapterRangeStrings.Add($"{start}-{end}"); + } + return string.Join(", ", chapterRangeStrings); + } + + public Dictionary> GetChapters( + IReadOnlyList parallelCorpora, + string fileLocation, + string scriptureRange + ) + { + CorpusBundle corpusBundle = new(parallelCorpora); + try + { + return ScriptureRangeParser.GetChapters( + scriptureRange, + corpusBundle.GetSettings(fileLocation)?.Versification + ); + } + catch (ArgumentException ae) + { + throw new InvalidOperationException($"The scripture range {scriptureRange} is not valid: {ae.Message}"); + } + } +} diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs deleted file mode 100644 index 4ee909b2a..000000000 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/TextCorpusService.cs +++ /dev/null @@ -1,51 +0,0 @@ -namespace SIL.ServiceToolkit.Services; - -public class TextCorpusService : ITextCorpusService -{ - public IEnumerable CreateTextCorpora(IReadOnlyList files) - { - List corpora = []; - - List> textFileCorpora = []; - foreach (CorpusFile file in files) - { - switch (file.Format) - { - case FileFormat.Text: - // if there are multiple texts with the same id, then add it to a new corpus or the first - // corpus that doesn't contain a text with that id - Dictionary? corpus = textFileCorpora.FirstOrDefault(c => - !c.ContainsKey(file.TextId) - ); - if (corpus is null) - { - corpus = []; - textFileCorpora.Add(corpus); - } - corpus[file.TextId] = new TextFileText(file.TextId, file.Location); - break; - - case FileFormat.Paratext: - corpora.Add(new ParatextBackupTextCorpus(file.Location, includeAllText: true)); - break; - } - } - foreach (Dictionary corpus in textFileCorpora) - corpora.Add(new DictionaryTextCorpus(corpus.Values)); - - return corpora; - } - - public IEnumerable CreateTermCorpora(IReadOnlyList corpusFiles) - { - foreach (CorpusFile file in corpusFiles) - { - switch (file.Format) - { - case FileFormat.Paratext: - yield return new ParatextBackupTermsCorpus(file.Location, ["PN"]); - break; - } - } - } -} diff --git a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectFileHandler.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectFileHandler.cs similarity index 97% rename from src/Serval/src/Serval.Shared/Services/ZipParatextProjectFileHandler.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectFileHandler.cs index c0e50d549..79a9cd49d 100644 --- a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectFileHandler.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectFileHandler.cs @@ -1,6 +1,6 @@ using SIL.IO; -namespace Serval.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class ZipParatextProjectFileHandler(IZipContainer container) : IParatextProjectFileHandler { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs new file mode 100644 index 000000000..5d3075d58 --- /dev/null +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectSettingsParser.cs @@ -0,0 +1,6 @@ +namespace SIL.ServiceToolkit.Services; + +public class ZipParatextProjectSettingsParser( + IZipContainer projectContainer, + ParatextProjectSettings? parentProjectSettings = null +) : ParatextProjectSettingsParserBase(new ZipParatextProjectFileHandler(projectContainer), parentProjectSettings) { } diff --git a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectTextUpdater.cs similarity index 91% rename from src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs rename to src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectTextUpdater.cs index cc83e84dc..506c8dd2a 100644 --- a/src/Serval/src/Serval.Shared/Services/ZipParatextProjectTextUpdater.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ZipParatextProjectTextUpdater.cs @@ -1,9 +1,9 @@ -namespace Serval.Shared.Services; +namespace SIL.ServiceToolkit.Services; public class ZipParatextProjectTextUpdater(IZipContainer projectContainer, ParatextProjectSettings? settings = null) : ParatextProjectTextUpdaterBase( new ZipParatextProjectFileHandler(projectContainer), - settings ?? new ZipParatextProjectSettingsParser(projectContainer).Parse() + settings ?? new ZipParatextProjectSettingsParser(projectContainer, settings).Parse() ), IDisposable { diff --git a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs index e4b01cb60..b52b3f447 100644 --- a/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs +++ b/src/ServiceToolkit/src/SIL.ServiceToolkit/Usings.cs @@ -26,4 +26,5 @@ global using SIL.ServiceToolkit.Configuration; global using SIL.ServiceToolkit.Models; global using SIL.ServiceToolkit.Services; +global using SIL.ServiceToolkit.Utils; global using SIL.WritingSystems; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs new file mode 100644 index 000000000..2cbf44bc8 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/CorpusBundleTests.cs @@ -0,0 +1,279 @@ +using SIL.ServiceToolkit.Utils; + +namespace SIL.ServiceToolkit.Services; + +public class CorpusBundleTests +{ + [Test] + public void GetSettings() + { + using TestEnvironment env = new(addParatext: true, addText: false); + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; + ParatextProjectSettings? settings = env.CorpusBundle.GetSettings(fileLocation); + Assert.That(settings, Is.Not.Null); + Assert.That(settings.Name, Is.EqualTo("Te1")); + Assert.That(env.CorpusBundle.ParentOf(fileLocation), Is.Null); + } + + [Test] + public void GetSettings_TextFile() + { + using TestEnvironment env = new(addParatext: false, addText: true); + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; + ParatextProjectSettings? settings = env.CorpusBundle.GetSettings(fileLocation); + Assert.That(settings, Is.Null); + Assert.That(env.CorpusBundle.ParentOf(fileLocation), Is.Null); + } + + [Test] + public void GetTextUpdater() + { + using TestEnvironment env = new(addParatext: true, addText: false); + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; + using ZipParatextProjectTextUpdater updater = env.CorpusBundle.GetTextUpdater(fileLocation); + Assert.That( + updater.UpdateUsfm("MAT", [], textBehavior: UpdateUsfmTextBehavior.PreferExisting).ReplaceLineEndings("\n"), + Is.EqualTo( + $@"\id MAT - Test +\h Matthew +\mt Matthew +\ip An introduction to Matthew +\c 1 +\p +\v 1 Source one, chapter one, verse one. +\v 2-3 Source one, chapter one, verse two and three. +\v 4 Source one, chapter one, verse four. +\v 5 Source one, chapter one, verse five. +\v 6 Source one, chapter one, verse six. +\v 7-9 Source one, chapter one, verse seven, eight, and nine. +\v 10 Source one, chapter one, verse ten. +\c 2 +\p +\v 1 Source one, chapter two, verse one. +\v 2 Source one, chapter two, verse two. “a quotation” +\v 3 ... +\v 4 ... +" + ) + .IgnoreLineEndings() + ); + } + + [Test] + public void GetTextUpdater_TextFile() + { + using TestEnvironment env = new(addParatext: false, addText: true); + string fileLocation = env.CorpusBundle.ParallelCorpora[0].SourceCorpora[0].Files[0].Location; + Assert.Throws(() => env.CorpusBundle.GetTextUpdater(fileLocation)); + } + + [Test] + public void GetTextCorpora() + { + using TestEnvironment env = new(addParatext: true, addText: true); + + Assert.That(env.CorpusBundle.ParallelCorpora, Has.Count.EqualTo(3)); + + Assert.That(env.CorpusBundle.SourceTermCorpora.Count(c => c.TextCorpora.Any()), Is.EqualTo(2)); + Assert.That( + env.CorpusBundle.SourceTermCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Word) + ); + Assert.That(env.CorpusBundle.TargetTermCorpora.Count(c => c.TextCorpora.Any()), Is.EqualTo(2)); + Assert.That( + env.CorpusBundle.TargetTermCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Word) + ); + + Assert.That(env.CorpusBundle.SourceTextCorpora.SelectMany(c => c.TextCorpora).Count(), Is.EqualTo(4)); + Assert.That( + env.CorpusBundle.SourceTextCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Segment) + ); + Assert.That(env.CorpusBundle.TargetTextCorpora.SelectMany(c => c.TextCorpora).Count(), Is.EqualTo(3)); + Assert.That( + env.CorpusBundle.TargetTextCorpora.SelectMany(c => c.TextCorpora) + .All(tc => tc.First().ContentType == TextRowContentType.Segment) + ); + } + + private class TestEnvironment : DisposableBase + { + public TestEnvironment(bool addParatext, bool addText) + { + CorpusBundle = new CorpusBundle(GetCorpora(addParatext, addText)); + } + + public CorpusBundle CorpusBundle { get; } + + private static readonly string TestDataPath = Path.Combine( + AppContext.BaseDirectory, + "..", + "..", + "..", + "Services", + "data" + ); + private readonly TempDirectory _tempDir = new(name: "CorpusBundleTests"); + + public ParallelCorpus[] GetCorpora(bool addParatext, bool addText) + { + List parallelCorpora = []; + if (addParatext) + { + parallelCorpora.AddRange( + new ParallelCorpus + { + Id = "corpus1", + SourceCorpora = + [ + new MonolingualCorpus + { + Id = "pt-source1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-source1"), + }, + ], + InferenceTextIds = [], + }, + ], + TargetCorpora = + [ + new MonolingualCorpus + { + Id = "pt-target1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-target1"), + }, + ], + }, + ], + }, + new ParallelCorpus + { + Id = "corpus2", + SourceCorpora = + [ + new MonolingualCorpus + { + Id = "pt-source1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-source1"), + }, + ], + TrainOnTextIds = [], + }, + ], + TargetCorpora = + [ + new MonolingualCorpus + { + Id = "pt-target1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Paratext, + Location = ZipParatextProject("pt-target1"), + }, + ], + TrainOnTextIds = [], + }, + ], + } + ); + } + if (addText) + { + parallelCorpora.AddRange( + new ParallelCorpus + { + Id = "corpus1", + SourceCorpora = + [ + new MonolingualCorpus + { + Id = "source-corpus1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source1.txt"), + }, + ], + }, + new MonolingualCorpus + { + Id = "source-corpus2", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "source2.txt"), + }, + ], + }, + ], + TargetCorpora = + [ + new MonolingualCorpus + { + Id = "target-corpus1", + Language = "en", + Files = + [ + new CorpusFile + { + TextId = "textId1", + Format = FileFormat.Text, + Location = Path.Combine(TestDataPath, "target1.txt"), + }, + ], + }, + ], + } + ); + } + return parallelCorpora.ToArray(); + } + + protected override void DisposeManagedResources() + { + _tempDir.Dispose(); + } + + private string ZipParatextProject(string name) + { + string fileName = Path.Combine(_tempDir.Path, $"{name}.zip"); + if (!File.Exists(fileName)) + ZipFile.CreateFromDirectory(Path.Combine(TestDataPath, name), fileName); + return fileName; + } + } +} diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs similarity index 94% rename from src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs rename to src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs index d861a9599..4032c8200 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusProcessingServiceTests.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Services/ParallelCorpusServiceTests.cs @@ -1,7 +1,7 @@ namespace SIL.ServiceToolkit.Services; [TestFixture] -public class ParallelCorpusPreprocessingServiceTests +public class ParallelCorpusServiceTests { [Test] public void TestParallelCorpusAnalysis_FileFormatParatext() @@ -10,9 +10,9 @@ public void TestParallelCorpusAnalysis_FileFormatParatext() ParallelCorpus parallelCorpus = env.GetCorpora(paratextProject: true).First(); const string ExpectedTargetName = "typewriter_english"; - QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetCorpusQuoteConvention( - parallelCorpus - ); + QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention([ + parallelCorpus, + ]); Assert.Multiple(() => { @@ -27,9 +27,9 @@ public void TestParallelCorpusAnalysis_FileFormatText() using var env = new TestEnvironment(); ParallelCorpus parallelCorpus = env.GetCorpora(paratextProject: false).First(); - QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetCorpusQuoteConvention( - parallelCorpus - ); + QuoteConventionAnalysis? targetQuotationConvention = env.Processor.AnalyzeTargetQuoteConvention([ + parallelCorpus, + ]); Assert.Multiple(() => { @@ -39,7 +39,7 @@ public void TestParallelCorpusAnalysis_FileFormatText() } [Test] - public async Task TestParallelCorpusPreprocessor_FileFormatText() + public async Task TestPreprocess_FileFormatText() { using var env = new TestEnvironment(); IReadOnlyList corpora = env.GetCorpora(paratextProject: false); @@ -73,7 +73,7 @@ await env.Processor.PreprocessAsync( } [Test] - public async Task TestParallelCorpusPreprocessor_FileFormatParatext() + public async Task TestPreprocess_FileFormatParatext() { using var env = new TestEnvironment(); IReadOnlyList corpora = env.GetCorpora(paratextProject: true); @@ -123,10 +123,9 @@ private class TestEnvironment : DisposableBase "Services", "data" ); - private readonly TempDirectory _tempDir = new TempDirectory(name: "ParallelCorpusProcessingServiceTests"); + private readonly TempDirectory _tempDir = new TempDirectory(name: "ParallelCorpusServiceTests"); - public IParallelCorpusPreprocessingService Processor { get; } = - new ParallelCorpusPreprocessingService(new TextCorpusService()); + public IParallelCorpusService Processor { get; } = new ParallelCorpusService(); public ParallelCorpus[] GetCorpora(bool paratextProject) { diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs index 4fb4a8aed..19d74f19e 100644 --- a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Usings.cs @@ -6,7 +6,9 @@ global using NSubstitute; global using NSubstitute.ExceptionExtensions; global using NUnit.Framework; +global using NUnit.Framework.Constraints; global using SIL.DataAccess; +global using SIL.Machine.Corpora; global using SIL.Machine.PunctuationAnalysis; global using SIL.Machine.Utils; global using SIL.ObjectModel; diff --git a/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs new file mode 100644 index 000000000..e52803012 --- /dev/null +++ b/src/ServiceToolkit/test/SIL.ServiceToolkit.Tests/Utils/NUnitExtensions.cs @@ -0,0 +1,9 @@ +namespace SIL.ServiceToolkit.Utils; + +public static class NUnitExtensions +{ + public static EqualUsingConstraint IgnoreLineEndings(this EqualStringConstraint constraint) + { + return constraint.Using(new IgnoreLineEndingsStringComparer()); + } +}