Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Echo/src/EchoEngine/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
builder.Services.AddHostedService<BackgroundTaskService>();
builder.Services.AddSingleton<BackgroundTaskQueue>();

builder.Services.AddParallelCorpusPreprocessor();
builder.Services.AddParallelCorpusService();

builder.Services.AddHealthChecks().AddCheck("Live", () => HealthCheckResult.Healthy());

Expand Down
13 changes: 6 additions & 7 deletions src/Echo/src/EchoEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ namespace EchoEngine;

public class TranslationEngineServiceV1(
BackgroundTaskQueue taskQueue,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IParallelCorpusService parallelCorpusService,
TranslationPlatformApi.TranslationPlatformApiClient platformApiClient
) : TranslationEngineApi.TranslationEngineApiBase
{
private static readonly Empty Empty = new();
private readonly BackgroundTaskQueue _taskQueue = taskQueue;
private readonly TranslationPlatformApi.TranslationPlatformApiClient _platformApiClient = platformApiClient;

private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
parallelCorpusPreprocessingService;
private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService;

public override Task<Empty> Create(CreateRequest request, ServerCallContext context)
{
Expand Down Expand Up @@ -125,22 +124,22 @@ await client.BuildStartedAsync(
int pretranslateCount = 0;

List<InsertPretranslationsRequest> pretranslationsRequests = [];
await _parallelCorpusPreprocessingService.PreprocessAsync(
request.Corpora.Select(Map).ToList(),
await _parallelCorpusService.PreprocessAsync(
request.Corpora.Select(Map),
(row, _) =>
{
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
return Task.CompletedTask;
},
(row, isInTrainingData, corpus) =>
(row, isInTrainingData, corpusId) =>
{
string[] tokens = row.SourceSegment.Split();
pretranslationsRequests.Add(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
CorpusId = corpusId,
TextId = row.TextId,
SourceRefs = { row.SourceRefs.Select(r => r.ToString()) },
TargetRefs = { row.TargetRefs.Select(r => r.ToString()) },
Expand Down
17 changes: 7 additions & 10 deletions src/Echo/src/EchoEngine/WordAlignmentEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,12 @@

namespace EchoEngine;

public class WordAlignmentEngineServiceV1(
BackgroundTaskQueue taskQueue,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService
) : WordAlignmentEngineApi.WordAlignmentEngineApiBase
public class WordAlignmentEngineServiceV1(BackgroundTaskQueue taskQueue, IParallelCorpusService parallelCorpusService)
: WordAlignmentEngineApi.WordAlignmentEngineApiBase
{
private static readonly Empty Empty = new();
private readonly BackgroundTaskQueue _taskQueue = taskQueue;
private readonly IParallelCorpusPreprocessingService _parallelCorpusPreprocessingService =
parallelCorpusPreprocessingService;
private readonly IParallelCorpusService _parallelCorpusService = parallelCorpusService;

public override Task<Empty> Create(CreateRequest request, ServerCallContext context)
{
Expand Down Expand Up @@ -79,21 +76,21 @@ await client.BuildStartedAsync(
int trainCount = 0;
int wordAlignCount = 0;
List<InsertWordAlignmentsRequest> wordAlignmentsRequests = [];
await _parallelCorpusPreprocessingService.PreprocessAsync(
request.Corpora.Select(Map).ToList(),
await _parallelCorpusService.PreprocessAsync(
request.Corpora.Select(Map),
(row, _) =>
{
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
return Task.CompletedTask;
},
(row, isInTrainingData, corpus) =>
(row, isInTrainingData, corpusId) =>
{
wordAlignmentsRequests.Add(
new InsertWordAlignmentsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
CorpusId = corpusId,
TextId = row.TextId,
SourceRefs = { row.SourceRefs.Select(r => r.ToString()) },
TargetRefs = { row.TargetRefs.Select(r => r.ToString()) },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
(sp, cancellationToken) =>
sp.GetRequiredService<IDistributedReaderWriterLockFactory>().InitAsync(cancellationToken)
);
services.AddParallelCorpusPreprocessor();
services.AddParallelCorpusService();
services.Configure<Bugsnag.Configuration>(configuration.GetSection("Bugsnag"));
services.AddBugsnag();
services.AddDiagnostics();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ public class NmtPreprocessBuildJob(
IBuildJobService<TranslationEngine> buildJobService,
ISharedFileService sharedFileService,
ILanguageTagService languageTagService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IParallelCorpusService parallelCorpusService,
IOptionsMonitor<BuildJobOptions> options
)
: TranslationPreprocessBuildJob(
Expand All @@ -18,7 +18,7 @@ IOptionsMonitor<BuildJobOptions> options
logger,
buildJobService,
sharedFileService,
parallelCorpusPreprocessingService,
parallelCorpusService,
options
)
{
Expand All @@ -33,21 +33,12 @@ private bool ResolveLanguageCode(string languageCode, out string resolvedCode)
protected override async Task UpdateTargetQuoteConventionAsync(
string engineId,
string buildId,
IReadOnlyList<ParallelCorpus> corpora,
IReadOnlyList<ParallelCorpus> parallelCorpora,
CancellationToken cancellationToken
)
{
List<QuoteConventionAnalysis> quoteConventionAnalyses = [];
foreach (ParallelCorpus parallelCorpus in corpora)
{
QuoteConventionAnalysis? targetQuotationConventionAnalysis =
ParallelCorpusPreprocessingService.AnalyzeTargetCorpusQuoteConvention(parallelCorpus);
if (targetQuotationConventionAnalysis != null)
quoteConventionAnalyses.Add(targetQuotationConventionAnalysis);
}

string overallTargetQuoteConventionAnalysis =
QuoteConventionAnalysis.CombineWithWeightedAverage(quoteConventionAnalyses)?.BestQuoteConvention?.Name
ParallelCorpusService.AnalyzeTargetQuoteConvention(parallelCorpora)?.BestQuoteConvention?.Name
?? string.Empty;

await PlatformService.UpdateTargetQuoteConventionAsync(
Expand All @@ -65,7 +56,7 @@ protected override async Task UpdateBuildExecutionData(
int pretranslateCount,
string sourceLanguageTag,
string targetLanguageTag,
IReadOnlyList<ParallelCorpus> corpora,
IReadOnlyList<ParallelCorpus> parallelCorpora,
CancellationToken cancellationToken
)
{
Expand All @@ -84,7 +75,7 @@ CancellationToken cancellationToken
pretranslateCount,
sourceLanguageTag,
targetLanguageTag,
corpora
parallelCorpora
);

int maxWarnings = BuildJobOptions.MaxWarnings;
Expand Down Expand Up @@ -128,12 +119,12 @@ protected override IReadOnlyList<string> GetWarnings(
int inferenceCount,
string sourceLanguageTag,
string targetLanguageTag,
IReadOnlyList<ParallelCorpus> corpora
IReadOnlyList<ParallelCorpus> parallelCorpora
)
{
List<string> warnings =
[
.. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, corpora),
.. base.GetWarnings(trainCount, inferenceCount, sourceLanguageTag, targetLanguageTag, parallelCorpora),
];

// Has at least a Gospel of Mark amount of data and not the special case of no data which will be caught elsewhere
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ public abstract class PreprocessBuildJob<TEngine>(
ILogger<PreprocessBuildJob<TEngine>> logger,
IBuildJobService<TEngine> buildJobService,
ISharedFileService sharedFileService,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IParallelCorpusService parallelCorpusService,
IOptionsMonitor<BuildJobOptions> options
)
: HangfireBuildJob<TEngine, IReadOnlyList<ParallelCorpus>>(
Expand All @@ -31,8 +31,7 @@ IOptionsMonitor<BuildJobOptions> options
internal BuildJobRunnerType TrainJobRunnerType { get; init; } = BuildJobRunnerType.ClearML;
protected readonly BuildJobOptions BuildJobOptions = options.CurrentValue;
protected readonly ISharedFileService SharedFileService = sharedFileService;
protected readonly IParallelCorpusPreprocessingService ParallelCorpusPreprocessingService =
parallelCorpusPreprocessingService;
protected readonly IParallelCorpusService ParallelCorpusService = parallelCorpusService;

protected override async Task DoWorkAsync(
string engineId,
Expand Down Expand Up @@ -95,28 +94,28 @@ protected abstract Task UpdateBuildExecutionData(
int inferenceCount,
string sourceLanguageTag,
string targetLanguageTag,
IReadOnlyList<ParallelCorpus> corpora,
IReadOnlyList<ParallelCorpus> parallelCorpora,
CancellationToken cancellationToken
);

protected virtual Task UpdateTargetQuoteConventionAsync(
string engineId,
string buildId,
IReadOnlyList<ParallelCorpus> corpora,
IReadOnlyList<ParallelCorpus> parallelCorpora,
CancellationToken cancellationToken
) => Task.CompletedTask;

protected abstract Task<(int TrainCount, int InferenceCount)> WriteDataFilesAsync(
string buildId,
IReadOnlyList<ParallelCorpus> corpora,
IReadOnlyList<ParallelCorpus> parallelCorpora,
string? buildOptions,
CancellationToken cancellationToken
);

protected override async Task CleanupAsync(
string engineId,
string buildId,
IReadOnlyList<ParallelCorpus> data,
IReadOnlyList<ParallelCorpus> parallelCorpora,
JobCompletionStatus completionStatus
)
{
Expand All @@ -138,34 +137,48 @@ protected virtual IReadOnlyList<string> GetWarnings(
int inferenceCount,
string sourceLanguageTag,
string targetLanguageTag,
IReadOnlyList<ParallelCorpus> corpora
IReadOnlyList<ParallelCorpus> parallelCorpora
)
{
List<string> warnings = [];

foreach (ParallelCorpus parallelCorpus in corpora)
foreach (
(
string parallelCorpusId,
string monolingualCorpusId,
IReadOnlyList<UsfmVersificationError> errors
) in ParallelCorpusService.AnalyzeUsfmVersification(parallelCorpora)
)
{
IReadOnlyList<(string MonolingualCorpusId, IReadOnlyList<UsfmVersificationError> errors)> errorsPerCorpus =
ParallelCorpusPreprocessingService.AnalyzeUsfmVersification(parallelCorpus);

foreach ((string monolingualCorpusId, IReadOnlyList<UsfmVersificationError> errors) in errorsPerCorpus)
foreach (UsfmVersificationError error in errors)
{
foreach (UsfmVersificationError error in errors)
{
warnings.Add(
error.Type switch
{
UsfmVersificationErrorType.InvalidChapterNumber =>
$"Invalid chapter number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})",
UsfmVersificationErrorType.InvalidVerseNumber =>
$"Invalid verse number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})",
_ =>
$"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpus.Id}, monolingual corpus {monolingualCorpusId})",
}
);
}
warnings.Add(
error.Type switch
{
UsfmVersificationErrorType.InvalidChapterNumber =>
$"Invalid chapter number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})",
UsfmVersificationErrorType.InvalidVerseNumber =>
$"Invalid verse number error in project {error.ProjectName} at “{error.ActualVerseRef}” (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})",
_ =>
$"USFM versification error in project {error.ProjectName}, expected verse “{error.ExpectedVerseRef}”, actual verse “{error.ActualVerseRef}”, mismatch type {error.Type} (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})",
}
);
}
}

foreach (
(
string parallelCorpusId,
string monolingualCorpusId,
MissingParentProjectError error
) in ParallelCorpusService.FindMissingParentProjects(parallelCorpora)
)
{
warnings.Add(
$"Unable to locate parent project {error.ParentProjectName} of daughter project {error.ProjectName} (parallel corpus {parallelCorpusId}, monolingual corpus {monolingualCorpusId})"
);
}

return warnings;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public class SmtTransferPreprocessBuildJob(
ISharedFileService sharedFileService,
IDistributedReaderWriterLockFactory lockFactory,
IRepository<TrainSegmentPair> trainSegmentPairs,
IParallelCorpusPreprocessingService parallelCorpusPreprocessingService,
IParallelCorpusService parallelCorpusService,
IOptionsMonitor<BuildJobOptions> options
)
: TranslationPreprocessBuildJob(
Expand All @@ -19,7 +19,7 @@ IOptionsMonitor<BuildJobOptions> options
logger,
buildJobService,
sharedFileService,
parallelCorpusPreprocessingService,
parallelCorpusService,
options
)
{
Expand All @@ -29,7 +29,7 @@ IOptionsMonitor<BuildJobOptions> options
protected override async Task InitializeAsync(
string engineId,
string buildId,
IReadOnlyList<ParallelCorpus> data,
IReadOnlyList<ParallelCorpus> corpora,
CancellationToken cancellationToken
)
{
Expand Down
Loading
Loading