diff --git a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs index dd84b952..53ca037c 100644 --- a/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs +++ b/src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs @@ -152,6 +152,7 @@ await _parallelCorpusService.PreprocessAsync( (_, i) => new AlignedWordPair() { SourceIndex = i, TargetIndex = i } ), }, + Confidence = 1.0, } ); if (row.SourceSegment.Length > 0 && !isInTrainingData) diff --git a/src/Machine/src/Serval.Machine.Shared/Consumers/TranslationInsertPretranslationsConsumer.cs b/src/Machine/src/Serval.Machine.Shared/Consumers/TranslationInsertPretranslationsConsumer.cs index 655a94b8..816448b9 100644 --- a/src/Machine/src/Serval.Machine.Shared/Consumers/TranslationInsertPretranslationsConsumer.cs +++ b/src/Machine/src/Serval.Machine.Shared/Consumers/TranslationInsertPretranslationsConsumer.cs @@ -41,6 +41,7 @@ CancellationToken cancellationToken Translation = pretranslation.Translation, SourceTokens = { pretranslation.SourceTokens }, TranslationTokens = { pretranslation.TranslationTokens }, + Confidence = pretranslation.Confidence, }; if (pretranslation.Alignment is not null) request.Alignment.Add(pretranslation.Alignment.Select(Map)); @@ -83,6 +84,7 @@ JsonSerializerOptions options sourceTokens = [], translationTokens = []; IReadOnlyList alignedWordPairs = []; + double confidence = 0.0; while (reader.Read() && reader.TokenType != JsonTokenType.EndObject) { if (reader.TokenType == JsonTokenType.PropertyName) @@ -128,6 +130,10 @@ JsonSerializerOptions options reader.Read(); alignedWordPairs = SIL.Machine.Corpora.AlignedWordPair.Parse(reader.GetString()).ToArray(); break; + case "sequenceConfidence": + reader.Read(); + confidence = reader.GetDouble(); + break; default: throw new JsonException( $"Unexpected property name {s} when deserializing Pretranslation object" @@ -145,6 +151,7 @@ JsonSerializerOptions options Alignment = alignedWordPairs, SourceTokens = sourceTokens, TranslationTokens = translationTokens, + Confidence = confidence, }; } diff --git a/src/Machine/src/Serval.Machine.Shared/Models/Pretranslation.cs b/src/Machine/src/Serval.Machine.Shared/Models/Pretranslation.cs index 60ea7860..a1ea6e74 100644 --- a/src/Machine/src/Serval.Machine.Shared/Models/Pretranslation.cs +++ b/src/Machine/src/Serval.Machine.Shared/Models/Pretranslation.cs @@ -10,4 +10,5 @@ public record Pretranslation public IEnumerable? SourceTokens { get; init; } public IEnumerable? TranslationTokens { get; init; } public IReadOnlyList? Alignment { get; init; } + public double Confidence { get; init; } } diff --git a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs index 27270956..44b5d3d6 100644 --- a/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs +++ b/src/Machine/src/Serval.Machine.Shared/Services/ServalTranslationEngineServiceV1.cs @@ -238,14 +238,13 @@ private static Translation.V1.TranslationSources Map(SIL.Machine.Translation.Tra private static IEnumerable Map(WordAlignmentMatrix source) { - for (int i = 0; i < source.RowCount; i++) - { - for (int j = 0; j < source.ColumnCount; j++) + return source + .ToAlignedWordPairs() + .Select(wp => new Translation.V1.AlignedWordPair { - if (source[i, j]) - yield return new Translation.V1.AlignedWordPair { SourceIndex = i, TargetIndex = j }; - } - } + SourceIndex = wp.SourceIndex, + TargetIndex = wp.TargetIndex, + }); } private static Translation.V1.Phrase Map(SIL.Machine.Translation.Phrase source) diff --git a/src/Machine/test/Serval.Machine.Shared.Tests/Consumers/TranslationInsertPretranslationsConsumerTests.cs b/src/Machine/test/Serval.Machine.Shared.Tests/Consumers/TranslationInsertPretranslationsConsumerTests.cs index 826cb012..8b45f10d 100644 --- a/src/Machine/test/Serval.Machine.Shared.Tests/Consumers/TranslationInsertPretranslationsConsumerTests.cs +++ b/src/Machine/test/Serval.Machine.Shared.Tests/Consumers/TranslationInsertPretranslationsConsumerTests.cs @@ -24,6 +24,7 @@ await JsonSerializer.SerializeAsync( ["textId"] = "MAT", ["refs"] = new JsonArray { "MAT 1:1" }, ["translation"] = "translation", + ["sequenceConfidence"] = 0.5, }, } ); @@ -44,6 +45,7 @@ await JsonSerializer.SerializeAsync( SourceRefs = { }, TargetRefs = { "MAT 1:1" }, Translation = "translation", + Confidence = 0.5, }, Arg.Any() ); @@ -67,7 +69,10 @@ await JsonSerializer.SerializeAsync( ["textId"] = "MAT", ["sourceRefs"] = new JsonArray { "MAT 1:1" }, ["targetRefs"] = new JsonArray { "MAT 1:1" }, + ["sourceTokens"] = new JsonArray { "translation" }, + ["translationTokens"] = new JsonArray { "translation" }, ["translation"] = "translation", + ["alignment"] = "0-0", }, } ); @@ -88,6 +93,13 @@ await JsonSerializer.SerializeAsync( SourceRefs = { "MAT 1:1" }, TargetRefs = { "MAT 1:1" }, Translation = "translation", + SourceTokens = { "translation" }, + TranslationTokens = { "translation" }, + Alignment = + { + new Translation.V1.AlignedWordPair { SourceIndex = 0, TargetIndex = 0 }, + }, + Confidence = 0.0, }, Arg.Any() ); diff --git a/src/Serval/src/Serval.Client/Client.g.cs b/src/Serval/src/Serval.Client/Client.g.cs index 218759a3..4f41e11b 100644 --- a/src/Serval/src/Serval.Client/Client.g.cs +++ b/src/Serval/src/Serval.Client/Client.g.cs @@ -11659,6 +11659,9 @@ public partial class Pretranslation [System.ComponentModel.DataAnnotations.Required(AllowEmptyStrings = true)] public string Translation { get; set; } = default!; + [Newtonsoft.Json.JsonProperty("confidence", Required = Newtonsoft.Json.Required.Always)] + public double Confidence { get; set; } = default!; + } [System.CodeDom.Compiler.GeneratedCode("NJsonSchema", "14.6.3.0 (NJsonSchema v11.5.2.0 (Newtonsoft.Json v13.0.0.0))")] diff --git a/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/platform.proto b/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/platform.proto index 2ec50f76..ed2d4043 100644 --- a/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/platform.proto +++ b/src/Serval/src/Serval.Grpc/Protos/serval/translation/v1/platform.proto @@ -70,6 +70,7 @@ message InsertPretranslationsRequest { repeated string source_tokens = 7; repeated string translation_tokens = 8; repeated AlignedWordPair alignment = 9; + double confidence = 10; } message UpdateBuildExecutionDataRequest { diff --git a/src/Serval/src/Serval.Translation/Contracts/PretranslationDto.cs b/src/Serval/src/Serval.Translation/Contracts/PretranslationDto.cs index 4d7c9cfb..d6487741 100644 --- a/src/Serval/src/Serval.Translation/Contracts/PretranslationDto.cs +++ b/src/Serval/src/Serval.Translation/Contracts/PretranslationDto.cs @@ -9,4 +9,5 @@ public record PretranslationDto [Obsolete] public IReadOnlyList? Refs { get; init; } public required string Translation { get; init; } + public double Confidence { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs index a1274812..001bdb5b 100644 --- a/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs +++ b/src/Serval/src/Serval.Translation/Controllers/TranslationEnginesController.cs @@ -2004,6 +2004,7 @@ private static PretranslationDto Map(Pretranslation source) TargetRefs = source.TargetRefs ?? [], Refs = source.Refs, Translation = source.Translation, + Confidence = source.Confidence ?? -1.0, }; } diff --git a/src/Serval/src/Serval.Translation/Models/Pretranslation.cs b/src/Serval/src/Serval.Translation/Models/Pretranslation.cs index 74f76ad2..760006af 100644 --- a/src/Serval/src/Serval.Translation/Models/Pretranslation.cs +++ b/src/Serval/src/Serval.Translation/Models/Pretranslation.cs @@ -15,4 +15,5 @@ public class Pretranslation : IEntity public IReadOnlyList? SourceTokens { get; init; } public IReadOnlyList? TranslationTokens { get; init; } public IReadOnlyList? Alignment { get; init; } + public double? Confidence { get; init; } } diff --git a/src/Serval/src/Serval.Translation/Services/TranslationPlatformServiceV1.cs b/src/Serval/src/Serval.Translation/Services/TranslationPlatformServiceV1.cs index 10554a6c..cef0c866 100644 --- a/src/Serval/src/Serval.Translation/Services/TranslationPlatformServiceV1.cs +++ b/src/Serval/src/Serval.Translation/Services/TranslationPlatformServiceV1.cs @@ -381,6 +381,7 @@ ServerCallContext context SourceTokens = request.SourceTokens, TranslationTokens = request.TranslationTokens, Alignment = request.Alignment.Select(Map).ToList(), + Confidence = request.Confidence, } ); if (batch.Count == PretranslationInsertBatchSize) diff --git a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs index ffefccb1..eedde8a1 100644 --- a/src/Serval/test/Serval.E2ETests/ServalApiTests.cs +++ b/src/Serval/test/Serval.E2ETests/ServalApiTests.cs @@ -263,18 +263,21 @@ public async Task Nmt_Paratext() }, ]; _helperClient.TranslationBuildConfig.Options = - "{\"max_steps\":10, \"use_key_terms\":true, \"train_params\": {\"per_device_train_batch_size\":4}}"; + "{\"max_steps\":50, \"use_key_terms\":true, \"parent_model_name\": \"facebook/nllb-200-distilled-600M\", \"train_params\": {\"per_device_train_batch_size\":4}, \"generate_params\":{\"num_beams\": 2}}"; await _helperClient.BuildEngineAsync(engineId); Assert.That( (await _helperClient.TranslationEnginesClient.GetAllBuildsAsync(engineId)).First().State, Is.EqualTo(JobState.Completed) ); + IList translations = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( engineId, inferencingParallelCorpusId ); Assert.That(translations, Is.Not.Empty); + Assert.That(translations[0].Confidence, Is.GreaterThan(0.0)); + IList firstJohnTranslations = await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( engineId, @@ -283,12 +286,14 @@ await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( ); // Only non-scripture was translated Assert.That(firstJohnTranslations.All(t => t.TargetRefs[0].Contains('/'))); + string usfm = await _helperClient.TranslationEnginesClient.GetPretranslatedUsfmAsync( engineId, inferencingParallelCorpusId, "REV" ); Assert.That(usfm, Does.Contain("\\v 1")); + string usfmWithPlacedMarkers = await _helperClient.TranslationEnginesClient.GetPretranslatedUsfmAsync( engineId, inferencingParallelCorpusId, @@ -296,6 +301,7 @@ await _helperClient.TranslationEnginesClient.GetAllPretranslationsAsync( paragraphMarkerBehavior: PretranslationUsfmMarkerBehavior.PreservePosition ); Assert.That(usfmWithPlacedMarkers, Is.Not.EqualTo(usfm)); + string usfmWithDenormalizedQuotes = await _helperClient.TranslationEnginesClient.GetPretranslatedUsfmAsync( engineId, inferencingParallelCorpusId, diff --git a/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs b/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs index c9ff894d..150b86ca 100644 --- a/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs +++ b/src/Serval/test/Serval.WordAlignment.Tests/Services/EngineServiceTests.cs @@ -1242,7 +1242,7 @@ public async Task UpdateCorpusAsync() } [Test] - public async Task DeletePretranslationsWhenParallelCorpusIsUpdatedAsync() + public async Task DeleteWordAlignmentsWhenParallelCorpusIsUpdatedAsync() { var env = new TestEnvironment(); Models.WordAlignment wordAlignment = new() @@ -1266,7 +1266,7 @@ public async Task DeletePretranslationsWhenParallelCorpusIsUpdatedAsync() } [Test] - public async Task DeletePretranslationsWhenCorpusFilesAreDeletedAsync() + public async Task DeleteWordAlignmentsWhenCorpusFilesAreDeletedAsync() { var env = new TestEnvironment(); Models.WordAlignment wordAlignment = new() @@ -1290,7 +1290,7 @@ public async Task DeletePretranslationsWhenCorpusFilesAreDeletedAsync() } [Test] - public async Task DeletePretranslationsWhenCorpusFilesAreUpdatedAsync() + public async Task DeleteWordAlignmentsWhenCorpusFilesAreUpdatedAsync() { var env = new TestEnvironment(); Models.WordAlignment wordAlignment = new()