Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move preprocess logic to toolkit #512

Merged
merged 31 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4485822
Initial refactoring (not tested)
Enkidu93 Oct 15, 2024
42c6778
Fix build error
Enkidu93 Oct 16, 2024
3d095ed
Update Echo engine to use toolkit
Enkidu93 Oct 16, 2024
ec30ff9
Fix machine version
Enkidu93 Oct 16, 2024
31a31c3
Fix async stream issue
Enkidu93 Oct 16, 2024
5a61291
Fix test: Add ability to specify CorpusService mock
Enkidu93 Oct 16, 2024
3ec2bce
Initial refactoring (not tested)
Enkidu93 Oct 15, 2024
293f31a
Migrate John's changes to toolkit
Enkidu93 Oct 16, 2024
60d4084
Fix bug with pretranslating all; begin porting tests to toolkit
Enkidu93 Oct 17, 2024
5033bd3
Another small logic fix; update tests to reflect not generating pretr…
Enkidu93 Oct 17, 2024
ec88283
Fix merge errors
Enkidu93 Oct 17, 2024
d3300c7
Fix issue with mapping non-parallel-corpora to parallel corpora
Enkidu93 Oct 17, 2024
6ceee18
Spread out steps for easier debugging
Enkidu93 Oct 17, 2024
2768b4c
Move to service; address scripture alignment issue
Enkidu93 Oct 18, 2024
331657b
Update Echo engine to use toolkit
Enkidu93 Oct 15, 2024
30756ee
Fix dependency
johnml1135 Oct 21, 2024
891f067
Fix up ParallelCorpusPreprocessingService
ddaspit Oct 22, 2024
f950d6e
Merge branch 'move_preprocess_logic_to_toolkit' of https://github.com…
Enkidu93 Nov 6, 2024
6b1bfdb
Fix dependencies
Enkidu93 Nov 6, 2024
4ec67d2
Working new logic
Enkidu93 Nov 8, 2024
c9a1e23
Change naming; using extensions
Enkidu93 Nov 8, 2024
d104852
Update to new merged corpus API
Enkidu93 Nov 15, 2024
57262e7
Switch over to using enum for usfm behavior
Enkidu93 Nov 25, 2024
939c142
Merge branch 'main' into move_preprocess_logic_to_toolkit
Enkidu93 Nov 25, 2024
5406415
Fix merge error
Enkidu93 Nov 25, 2024
05fd739
Update to machine 3.5.0
Enkidu93 Nov 25, 2024
d33904f
More version updates
Enkidu93 Nov 25, 2024
2c0ae9f
Fix merge error
Enkidu93 Nov 26, 2024
5c6950a
Address reviewer comments
Enkidu93 Nov 26, 2024
8db0bad
Make logic consistent; remove inconsistent error messages
Enkidu93 Nov 26, 2024
14050d9
Merge branch 'main' into move_preprocess_logic_to_toolkit
johnml1135 Nov 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Serval.sln
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C3A14577-A65
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SIL.ServiceToolkit", "src\ServiceToolkit\src\SIL.ServiceToolkit\SIL.ServiceToolkit.csproj", "{0E40F959-C641-40A2-9750-B17A4F9F9E55}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{E41916A7-B9AA-45BE-BCFF-656722FEEA84}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ServiceToolkit", "ServiceToolkit", "{A4DA43D4-29BC-4164-A114-E1775B2C9573}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{5C42D20E-8DFC-4221-BA97-62D9E5742349}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SIL.ServiceToolkit.Tests", "src\ServiceToolkit\test\SIL.ServiceToolkit\SIL.ServiceToolkit.Tests.csproj", "{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -180,6 +188,10 @@ Global
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0E40F959-C641-40A2-9750-B17A4F9F9E55}.Release|Any CPU.Build.0 = Release|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -215,6 +227,9 @@ Global
{10657805-48F1-4205-B8F5-79447F6EF620} = {25CDB05B-4E24-4A6E-933E-1E0BEC97D74D}
{C3A14577-A654-4604-818C-4E683DD45A51} = {EA69B41C-49EF-4017-A687-44B9DF37FF98}
{0E40F959-C641-40A2-9750-B17A4F9F9E55} = {C3A14577-A654-4604-818C-4E683DD45A51}
{A4DA43D4-29BC-4164-A114-E1775B2C9573} = {E41916A7-B9AA-45BE-BCFF-656722FEEA84}
{5C42D20E-8DFC-4221-BA97-62D9E5742349} = {A4DA43D4-29BC-4164-A114-E1775B2C9573}
{3DC5CD22-3E98-434A-9B00-EBC4DDF797A1} = {5C42D20E-8DFC-4221-BA97-62D9E5742349}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9F18C25E-E140-43C3-B177-D562E1628370}
Expand Down
230 changes: 85 additions & 145 deletions src/Echo/src/EchoTranslationEngine/TranslationEngineServiceV1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,154 +80,25 @@ await client.BuildStartedAsync(
client.InsertPretranslations(cancellationToken: cancellationToken)
)
{
foreach (ParallelCorpus corpus in request.Corpora)
{
var sourceFiles = corpus
.SourceCorpora.SelectMany(sc =>
sc.Files.Where(f =>
(
sc.PretranslateAll
|| sc.PretranslateTextIds is null
|| sc.PretranslateTextIds.Contains(f.TextId)
)
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);
var targetFiles = corpus
.TargetCorpora.SelectMany(tc =>
tc.Files.Where(f =>
(
tc.PretranslateAll
|| tc.PretranslateTextIds is null
|| tc.PretranslateTextIds.Contains(f.TextId)
)
&& f.Format == FileFormat.Text
)
)
.ToDictionary(f => f.TextId, f => f.Location);

foreach (KeyValuePair<string, string> sourceFile in sourceFiles)
ParallelCorpusPreprocessor.PreprocessCorpora(
request.Corpora.Select(Map).ToList(),
row => { },
async (row, corpus) =>
{
string[] sourceLines = await File.ReadAllLinesAsync(
sourceFile.Value,
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = row.TextId,
Refs = { row.Refs.Select(r => r.ToString()) },
Translation = row.SourceSegment
},
cancellationToken
);

if (targetFiles.TryGetValue(sourceFile.Key, out string? targetPath))
{
string[] targetLines = await File.ReadAllLinesAsync(targetPath, cancellationToken);
bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
if (!isTabSeparated)
{
int lineNum = 1;
foreach (
(string sourceLine, string targetLine) in sourceLines
.Select(l => l.Trim())
.Zip(targetLines.Select(l => l.Trim()))
)
{
if (sourceLine.Length > 0 && targetLine.Length == 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{lineNum}" },
Translation = sourceLine
},
cancellationToken
);
}
lineNum++;
}
}
else
{
var sourceLinesDict = sourceLines.ToDictionary(
l => l.Split('\t')[0].Trim(),
l => l.Split('\t')[1].Trim()
);
var targetLinesDict = targetLines.ToDictionary(
l => l.Split('\t')[0].Trim(),
l => l.Contains('\t') ? l.Split('\t')[1].Trim() : string.Empty
);
foreach (KeyValuePair<string, string> targetLineKVPair in targetLinesDict)
{
string? sourceLine = null;
sourceLinesDict.TryGetValue(targetLineKVPair.Key, out sourceLine);
sourceLine ??= string.Empty;
string? targetLine = targetLineKVPair.Value;
if (sourceLine.Length > 0 && targetLine.Length == 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{targetLineKVPair.Key}" },
Translation = sourceLine
},
cancellationToken
);
}
}
}
}
else
{
bool isTabSeparated = (sourceLines.Length > 0) && sourceLines[0].Contains('/');
if (!isTabSeparated)
{
int lineNum = 1;
foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
{
if (sourceLine.Length > 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{lineNum}" },
Translation = sourceLine
},
cancellationToken
);
}
lineNum++;
}
}
else
{
foreach (string sourceLine in sourceLines.Select(l => l.Trim()))
{
if (sourceLine.Length > 0)
{
await call.RequestStream.WriteAsync(
new InsertPretranslationsRequest
{
EngineId = request.EngineId,
CorpusId = corpus.Id,
TextId = sourceFile.Key,
Refs = { $"{sourceFile.Key}:{sourceLine.Split('\t')[0]}" },
Translation = sourceLine.Contains('\t')
? sourceLine.Split('\t')[1].Trim()
: string.Empty
},
cancellationToken
);
}
}
}
}
}
}

},
false
);
await call.RequestStream.CompleteAsync();
await call;
}
Expand Down Expand Up @@ -325,4 +196,73 @@ ServerCallContext context
new GetLanguageInfoResponse { InternalCode = request.Language + "_echo", IsNative = true, }
);
}

/// <summary>
/// Converts a <see cref="ParallelCorpus"/> into its <c>SIL.ServiceToolkit.Models</c> counterpart.
/// </summary>
/// <param name="source">The parallel corpus to convert.</param>
/// <returns>
/// A toolkit parallel corpus carrying the same id, with every source and target
/// monolingual corpus converted and materialized into a list.
/// </returns>
private static SIL.ServiceToolkit.Models.ParallelCorpus Map(ParallelCorpus source) =>
    new()
    {
        Id = source.Id,
        SourceCorpora = source.SourceCorpora.Select(corpus => Map(corpus)).ToList(),
        TargetCorpora = source.TargetCorpora.Select(corpus => Map(corpus)).ToList()
    };

/// <summary>
/// Converts a <see cref="MonolingualCorpus"/> into its <c>SIL.ServiceToolkit.Models</c> counterpart.
/// </summary>
/// <remarks>
/// For training and for pretranslation independently, at most one of the two filters
/// (chapters or text ids) is forwarded; the other is set to <c>null</c>.
/// </remarks>
/// <param name="source">The monolingual corpus to convert.</param>
/// <returns>A toolkit monolingual corpus with the same id, language and (converted) files.</returns>
private static SIL.ServiceToolkit.Models.MonolingualCorpus Map(MonolingualCorpus source)
{
    // Training selection: copy the chapter map and the text ids into plain collections.
    Dictionary<string, HashSet<int>> trainingChapters = source.TrainOnChapters.ToDictionary(
        entry => entry.Key,
        entry => entry.Value.Chapters.ToHashSet()
    );
    HashSet<string> trainingTextIds = source.TrainOnTextIds.ToHashSet();
    FilterChoice trainingChoice = GetFilterChoice(trainingChapters, trainingTextIds);

    // Pretranslation selection: same treatment as the training selection.
    Dictionary<string, HashSet<int>> pretranslationChapters = source.PretranslateChapters.ToDictionary(
        entry => entry.Key,
        entry => entry.Value.Chapters.ToHashSet()
    );
    HashSet<string> pretranslationTextIds = source.PretranslateTextIds.ToHashSet();
    FilterChoice pretranslationChoice = GetFilterChoice(pretranslationChapters, pretranslationTextIds);

    // NOTE(review): the collections built above are never null, so GetFilterChoice cannot
    // return FilterChoice.None here; an empty chapter map always falls back to the text-id
    // set (possibly empty). Confirm that is the intended representation.
    Dictionary<string, HashSet<int>>? chaptersToTrainOn =
        trainingChoice == FilterChoice.Chapters ? trainingChapters : null;
    HashSet<string>? textIdsToTrainOn = trainingChoice == FilterChoice.TextIds ? trainingTextIds : null;
    Dictionary<string, HashSet<int>>? chaptersToPretranslate =
        pretranslationChoice == FilterChoice.Chapters ? pretranslationChapters : null;
    HashSet<string>? textIdsToPretranslate =
        pretranslationChoice == FilterChoice.TextIds ? pretranslationTextIds : null;

    return new SIL.ServiceToolkit.Models.MonolingualCorpus
    {
        Id = source.Id,
        Language = source.Language,
        Files = source.Files.Select(file => Map(file)).ToList(),
        TrainOnChapters = chaptersToTrainOn,
        TrainOnTextIds = textIdsToTrainOn,
        PretranslateChapters = chaptersToPretranslate,
        PretranslateTextIds = textIdsToPretranslate
    };
}

/// <summary>
/// Converts a <see cref="CorpusFile"/> into its <c>SIL.ServiceToolkit.Models</c> counterpart.
/// </summary>
/// <param name="source">The corpus file to convert.</param>
/// <returns>A toolkit corpus file with the same location, format and text id.</returns>
private static SIL.ServiceToolkit.Models.CorpusFile Map(CorpusFile source)
{
    // The format is carried over by a direct enum cast.
    // NOTE(review): this presumes both FileFormat enums use the same numeric values — confirm.
    var toolkitFormat = (SIL.ServiceToolkit.Models.FileFormat)source.Format;

    var toolkitFile = new SIL.ServiceToolkit.Models.CorpusFile
    {
        TextId = source.TextId,
        Format = toolkitFormat,
        Location = source.Location
    };
    return toolkitFile;
}

/// <summary>
/// Identifies which of the mutually exclusive corpus filters is forwarded to the toolkit model.
/// </summary>
private enum FilterChoice
{
    /// <summary>The chapter map is forwarded; the text ids are set to <c>null</c>.</summary>
    Chapters = 0,

    /// <summary>The text ids are forwarded; the chapter map is set to <c>null</c>.</summary>
    TextIds = 1,

    /// <summary>Neither filter is forwarded.</summary>
    None = 2
}

/// <summary>
/// Decides which of the two mutually exclusive filters (chapters or text ids) should be used.
/// </summary>
/// <param name="chapters">The chapter selection; may be <c>null</c> or empty.</param>
/// <param name="textIds">The text-id selection; may be <c>null</c> or empty.</param>
/// <returns>
/// <see cref="FilterChoice.None"/> when both arguments are <c>null</c>;
/// <see cref="FilterChoice.TextIds"/> when <paramref name="chapters"/> is <c>null</c> or empty;
/// otherwise <see cref="FilterChoice.Chapters"/>.
/// </returns>
private static FilterChoice GetFilterChoice(
    IReadOnlyDictionary<string, HashSet<int>>? chapters,
    HashSet<string>? textIds
)
{
    // Only one of text ids or a Scripture range (chapters) is used at a time.
    // Text ids may be an empty collection, so prefer them when both selections are empty
    // (this applies to both Scripture and plain-text corpora).
    if (chapters is null && textIds is null)
        return FilterChoice.None;
    if (chapters is null || chapters.Count == 0)
        return FilterChoice.TextIds;
    return FilterChoice.Chapters;
}
}
1 change: 1 addition & 0 deletions src/Echo/src/EchoTranslationEngine/Usings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
global using Grpc.Core;
global using Microsoft.Extensions.Diagnostics.HealthChecks;
global using Serval.Translation.V1;
global using SIL.ServiceToolkit.Utils;
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ public static IMachineBuilder AddMachine(this IServiceCollection services, IConf
services.AddTransient<IFileSystem, FileSystem>();

services.AddScoped<IDistributedReaderWriterLockFactory, DistributedReaderWriterLockFactory>();
services.AddSingleton<ICorpusService, CorpusService>();
services.AddStartupTask(
(sp, cancellationToken) =>
sp.GetRequiredService<IDistributedReaderWriterLockFactory>().InitAsync(cancellationToken)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,8 @@ public class NmtPreprocessBuildJob(
ILogger<NmtPreprocessBuildJob> logger,
IBuildJobService buildJobService,
ISharedFileService sharedFileService,
ICorpusService corpusService,
ILanguageTagService languageTagService
)
: PreprocessBuildJob(
platformService,
engines,
dataAccessContext,
logger,
buildJobService,
sharedFileService,
corpusService
)
) : PreprocessBuildJob(platformService, engines, dataAccessContext, logger, buildJobService, sharedFileService)
{
private readonly ILanguageTagService _languageTagService = languageTagService;

Expand Down
Loading
Loading