Skip to content

Commit f630908

Browse files
committed
Passing tests for pretranslateAll, trainOnAll
1 parent 49d8ea9 commit f630908

5 files changed

Lines changed: 73 additions & 37 deletions

File tree

src/Serval/src/Serval.Translation/Services/CorpusMappingService.cs

Lines changed: 30 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
using SIL.Extensions;
2-
31
namespace Serval.Translation.Services;
42

53
public class CorpusMappingService(
@@ -56,12 +54,16 @@ Corpus source in corpora.Where(c =>
5654
Id = source.Id,
5755
Language = source.SourceLanguage,
5856
Files = source.SourceFiles.Select(Map).ToArray(),
57+
TrainOnAll = trainOnAllCorpora,
58+
PretranslateAll = pretranslateAllCorpora,
5959
};
6060
SIL.ServiceToolkit.Models.MonolingualCorpus targetCorpus = new()
6161
{
6262
Id = source.Id,
6363
Language = source.TargetLanguage,
6464
Files = source.TargetFiles.Select(Map).ToArray(),
65+
TrainOnAll = trainOnAllCorpora,
66+
PretranslateAll = pretranslateAllCorpora,
6567
};
6668

6769
if (trainingCorpus is not null)
@@ -72,12 +74,10 @@ Corpus source in corpora.Where(c =>
7274
$"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for trainOn"
7375
);
7476
}
75-
if (trainingCorpus.TextIds is not null)
76-
{
77-
sourceCorpus.TrainOnTextIds.AddRange(trainingCorpus.TextIds);
78-
targetCorpus.TrainOnTextIds.AddRange(trainingCorpus.TextIds);
79-
}
80-
if (!string.IsNullOrEmpty(trainingCorpus.ScriptureRange))
77+
sourceCorpus.TrainOnTextIds = trainingCorpus.TextIds?.ToHashSet();
78+
targetCorpus.TrainOnTextIds = trainingCorpus.TextIds?.ToHashSet();
79+
80+
if (trainingCorpus.ScriptureRange is not null)
8181
{
8282
if (
8383
targetCorpus.Files.Count > 1
@@ -98,6 +98,8 @@ Corpus source in corpora.Where(c =>
9898
sourceCorpus.TrainOnChapters = chapters;
9999
targetCorpus.TrainOnChapters = chapters;
100100
}
101+
sourceCorpus.TrainOnAll = sourceCorpus.TrainOnChapters is null && sourceCorpus.TrainOnTextIds is null;
102+
targetCorpus.TrainOnAll = targetCorpus.TrainOnChapters is null && targetCorpus.TrainOnTextIds is null;
101103
}
102104

103105
if (pretranslateCorpus is not null)
@@ -108,9 +110,8 @@ Corpus source in corpora.Where(c =>
108110
$"The corpus {source.Id} cannot specify both 'textIds' and 'scriptureRange' for 'pretranslate'."
109111
);
110112
}
111-
if (pretranslateCorpus.TextIds is not null)
112-
sourceCorpus.InferenceTextIds.AddRange(pretranslateCorpus.TextIds);
113-
if (!string.IsNullOrEmpty(pretranslateCorpus.ScriptureRange))
113+
sourceCorpus.InferenceTextIds = pretranslateCorpus.TextIds?.ToHashSet();
114+
if (pretranslateCorpus.ScriptureRange is not null)
114115
{
115116
if (
116117
targetCorpus.Files.Count > 1
@@ -129,14 +130,16 @@ Corpus source in corpora.Where(c =>
129130
)
130131
.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.ToHashSet());
131132
}
133+
sourceCorpus.PretranslateAll =
134+
sourceCorpus.InferenceChapters is null && sourceCorpus.InferenceTextIds is null;
135+
targetCorpus.PretranslateAll =
136+
targetCorpus.InferenceChapters is null && targetCorpus.InferenceTextIds is null;
132137
}
133138
SIL.ServiceToolkit.Models.ParallelCorpus corpus = new()
134139
{
135140
Id = source.Id,
136141
SourceCorpora = [sourceCorpus],
137142
TargetCorpora = [targetCorpus],
138-
TrainOnAllCorpora = trainOnAllCorpora,
139-
PretranslateAllCorpora = pretranslateAllCorpora,
140143
};
141144
mappedParallelCorpora.Add(corpus);
142145
}
@@ -186,7 +189,11 @@ IReadOnlyList<ParallelCorpus> parallelCorpora
186189
sc,
187190
trainingCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(),
188191
pretranslateCorpus?.SourceFilters?.Where(sf => sf.CorpusRef == sc.Id).FirstOrDefault(),
189-
referenceFileLocation
192+
referenceFileLocation,
193+
trainOnAllCorpora
194+
|| (trainingCorpus is not null && trainingCorpus.SourceFilters is null),
195+
pretranslateAllCorpora
196+
|| (pretranslateCorpus is not null && pretranslateCorpus.SourceFilters is null)
190197
)
191198
)
192199
.ToArray(),
@@ -197,12 +204,13 @@ IReadOnlyList<ParallelCorpus> parallelCorpora
197204
tc,
198205
trainingCorpus?.TargetFilters?.Where(sf => sf.CorpusRef == tc.Id).FirstOrDefault(),
199206
null,
200-
referenceFileLocation
207+
referenceFileLocation,
208+
trainOnAllCorpora
209+
|| (trainingCorpus is not null && trainingCorpus.TargetFilters is null),
210+
pretranslateAllCorpora || pretranslateCorpus is not null
201211
)
202212
)
203213
.ToArray(),
204-
TrainOnAllCorpora = trainOnAllCorpora,
205-
PretranslateAllCorpora = pretranslateAllCorpora,
206214
}
207215
);
208216
}
@@ -214,7 +222,9 @@ private SIL.ServiceToolkit.Models.MonolingualCorpus Map(
214222
MonolingualCorpus inputCorpus,
215223
ParallelCorpusFilter? trainingFilter,
216224
ParallelCorpusFilter? pretranslateFilter,
217-
string? referenceFileLocation
225+
string? referenceFileLocation,
226+
bool trainOnAll,
227+
bool pretranslateAll
218228
)
219229
{
220230
Dictionary<string, HashSet<int>>? trainOnChapters = null;
@@ -254,6 +264,8 @@ pretranslateFilter is not null
254264
Id = inputCorpus.Id,
255265
Language = inputCorpus.Language,
256266
Files = inputCorpus.Files.Select(Map).ToArray(),
267+
TrainOnAll = trainOnAll,
268+
PretranslateAll = pretranslateAll,
257269
};
258270

259271
if (

src/Serval/src/Serval.Translation/Services/EngineService.cs

Lines changed: 7 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -843,22 +843,12 @@ private static V1.ParallelCorpus Map(SIL.ServiceToolkit.Models.ParallelCorpus so
843843
return new V1.ParallelCorpus
844844
{
845845
Id = source.Id,
846-
SourceCorpora =
847-
{
848-
source.SourceCorpora.Select(c => Map(c, source.TrainOnAllCorpora, source.PretranslateAllCorpora)),
849-
},
850-
TargetCorpora =
851-
{
852-
source.TargetCorpora.Select(c => Map(c, source.TrainOnAllCorpora, source.PretranslateAllCorpora)),
853-
},
846+
SourceCorpora = { source.SourceCorpora.Select(Map) },
847+
TargetCorpora = { source.TargetCorpora.Select(Map) },
854848
};
855849
}
856850

857-
private static V1.MonolingualCorpus Map(
858-
SIL.ServiceToolkit.Models.MonolingualCorpus source,
859-
bool trainOnAll,
860-
bool pretranslateAll
861-
)
851+
private static V1.MonolingualCorpus Map(SIL.ServiceToolkit.Models.MonolingualCorpus source)
862852
{
863853
var corpus = new V1.MonolingualCorpus
864854
{
@@ -867,15 +857,15 @@ bool pretranslateAll
867857
Files = { source.Files.Select(Map) },
868858
};
869859

870-
if (trainOnAll || (source.TrainOnTextIds is null && source.TrainOnChapters is null))
860+
if (source.TrainOnAll)
871861
{
872862
corpus.TrainOnAll = true;
873863
}
874-
if (source.TrainOnTextIds is not null)
864+
else if (source.TrainOnTextIds is not null)
875865
{
876866
corpus.TrainOnTextIds.Add(source.TrainOnTextIds);
877867
}
878-
if (source.TrainOnChapters is not null)
868+
else if (source.TrainOnChapters is not null)
879869
{
880870
corpus.TrainOnChapters.Add(
881871
source
@@ -889,7 +879,7 @@ bool pretranslateAll
889879
);
890880
}
891881

892-
if (pretranslateAll || (source.InferenceTextIds is null && source.InferenceChapters is null))
882+
if (source.PretranslateAll)
893883
{
894884
corpus.PretranslateAll = true;
895885
}

src/Serval/test/Serval.Translation.Tests/Services/EngineServiceTests.cs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ await env.Service.StartBuildAsync(
138138
{
139139
new()
140140
{
141+
Id = "corpus1",
141142
Language = "es",
142143
Files =
143144
{
@@ -159,6 +160,7 @@ await env.Service.StartBuildAsync(
159160
{
160161
new()
161162
{
163+
Id = "corpus1",
162164
Language = "en",
163165
Files =
164166
{
@@ -217,6 +219,8 @@ await env.Service.StartBuildAsync(
217219
{
218220
new()
219221
{
222+
Id = "corpus1",
223+
220224
Language = "es",
221225
TrainOnTextIds = { },
222226
Files =
@@ -239,6 +243,8 @@ await env.Service.StartBuildAsync(
239243
{
240244
new()
241245
{
246+
Id = "corpus1",
247+
242248
Language = "en",
243249
TrainOnTextIds = { },
244250
Files =
@@ -298,6 +304,8 @@ await env.Service.StartBuildAsync(
298304
{
299305
new()
300306
{
307+
Id = "corpus1",
308+
301309
Language = "es",
302310
TrainOnTextIds = { "text1" },
303311
Files =
@@ -320,6 +328,8 @@ await env.Service.StartBuildAsync(
320328
{
321329
new()
322330
{
331+
Id = "corpus1",
332+
323333
Language = "en",
324334
TrainOnTextIds = { "text1" },
325335
Files =
@@ -379,6 +389,8 @@ await env.Service.StartBuildAsync(
379389
{
380390
new()
381391
{
392+
Id = "corpus1",
393+
382394
Language = "es",
383395
Files =
384396
{
@@ -400,6 +412,8 @@ await env.Service.StartBuildAsync(
400412
{
401413
new()
402414
{
415+
Id = "corpus1",
416+
403417
Language = "en",
404418
Files =
405419
{
@@ -459,6 +473,8 @@ await env.Service.StartBuildAsync(
459473
{
460474
new()
461475
{
476+
Id = "corpus1",
477+
462478
Language = "es",
463479
Files =
464480
{
@@ -480,6 +496,8 @@ await env.Service.StartBuildAsync(
480496
{
481497
new()
482498
{
499+
Id = "corpus1",
500+
483501
Language = "en",
484502
Files =
485503
{
@@ -539,6 +557,8 @@ await env.Service.StartBuildAsync(
539557
{
540558
new()
541559
{
560+
Id = "corpus1",
561+
542562
Language = "es",
543563
Files =
544564
{
@@ -560,6 +580,8 @@ await env.Service.StartBuildAsync(
560580
{
561581
new()
562582
{
583+
Id = "corpus1",
584+
563585
Language = "en",
564586
Files =
565587
{
@@ -585,6 +607,8 @@ await env.Service.StartBuildAsync(
585607
{
586608
new()
587609
{
610+
Id = "corpus2",
611+
588612
Language = "es",
589613
Files =
590614
{
@@ -606,6 +630,8 @@ await env.Service.StartBuildAsync(
606630
{
607631
new()
608632
{
633+
Id = "corpus2",
634+
609635
Language = "en",
610636
Files =
611637
{
@@ -682,6 +708,8 @@ await env.Service.StartBuildAsync(
682708
{
683709
new()
684710
{
711+
Id = "corpus1",
712+
685713
Language = "es",
686714
TrainOnChapters =
687715
{
@@ -714,6 +742,8 @@ await env.Service.StartBuildAsync(
714742
{
715743
new()
716744
{
745+
Id = "corpus1",
746+
717747
Language = "en",
718748
TrainOnChapters =
719749
{
@@ -783,6 +813,8 @@ await env.Service.StartBuildAsync(
783813
{
784814
new()
785815
{
816+
Id = "corpus1",
817+
786818
Language = "es",
787819
Files =
788820
{
@@ -804,6 +836,8 @@ await env.Service.StartBuildAsync(
804836
{
805837
new()
806838
{
839+
Id = "corpus1",
840+
807841
Language = "en",
808842
Files =
809843
{

src/ServiceToolkit/src/SIL.ServiceToolkit/Models/MonolingualCorpus.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ public record MonolingualCorpus
99
public Dictionary<string, HashSet<int>>? TrainOnChapters { get; set; }
1010
public HashSet<string>? InferenceTextIds { get; set; }
1111
public Dictionary<string, HashSet<int>>? InferenceChapters { get; set; }
12+
public bool TrainOnAll { get; set; }
13+
public bool PretranslateAll { get; set; }
1214

1315
public bool IsFiltered =>
1416
TrainOnTextIds != null || TrainOnChapters != null || InferenceTextIds != null || InferenceChapters != null;

src/ServiceToolkit/src/SIL.ServiceToolkit/Models/ParallelCorpus.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,4 @@ public record ParallelCorpus
55
public required string Id { get; set; }
66
public IReadOnlyList<MonolingualCorpus> SourceCorpora { get; set; } = new List<MonolingualCorpus>();
77
public IReadOnlyList<MonolingualCorpus> TargetCorpora { get; set; } = new List<MonolingualCorpus>();
8-
public bool TrainOnAllCorpora { get; set; }
9-
public bool PretranslateAllCorpora { get; set; }
108
}

0 commit comments

Comments
 (0)