Skip to content

Commit

Permalink
Move scripture range parsing to Serval
Browse files (browse the repository at this point in the history)
  • Loading branch information
Enkidu93 committed Feb 3, 2024
1 parent 18d2d64 commit 1119992
Show file tree
Hide file tree
Showing 8 changed files with 572 additions and 302 deletions.
4 changes: 2 additions & 2 deletions src/SIL.Machine.AspNetCore/Models/Corpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ public class Corpus
public string TargetLanguage { get; set; } = default!;
public bool TrainOnAll { get; set; }
public bool PretranslateAll { get; set; }
public string? TrainOnBiblicalRange { get; set; }
public string? PretranslateBiblicalRange {get; set; }
public Dictionary<string, List<int>>? TrainOnChapters { get; set; }
public Dictionary<string, List<int>>? PretranslateChapters { get; set; }
public HashSet<string> TrainOnTextIds { get; set; } = default!;
public HashSet<string> PretranslateTextIds { get; set; } = default!;
public List<CorpusFile> SourceFiles { get; set; } = default!;
Expand Down
165 changes: 0 additions & 165 deletions src/SIL.Machine.AspNetCore/Services/BiblicalRangeStringParser.cs

This file was deleted.

62 changes: 44 additions & 18 deletions src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -130,32 +130,58 @@ async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()

foreach (ParallelTextRow row in parallelCorpora.Flatten())
{
bool isInTrainOnRange = false;
bool isInPretranslateRange = false;
if(targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef)){
Dictionary<string, List<int>> rowChaptersPerBook = row.Refs.Cast<VerseRef>().GroupBy(vr => vr.Book).ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList());
var parser = new BiblicalRangeStringParser(stc.Versification);
if(corpus.TrainOnBiblicalRange != null && corpus.TrainOnBiblicalRange != ""){
Dictionary<string, List<int>> trainOnBiblicalRangeChapters = parser.Parse(corpus.TrainOnBiblicalRange); //TODO calculate once
isInTrainOnRange = rowChaptersPerBook.Join(trainOnBiblicalRangeChapters, rcpb => rcpb.Key, tobrc => tobrc.Key, (rcbp, tobrc) =>
rcbp.Value.Intersect(tobrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book
).Any(b => b);
}
if(corpus.PretranslateBiblicalRange != null && corpus.PretranslateBiblicalRange != ""){
Dictionary<string, List<int>> pretranslateBiblicalRangeChapters = parser.Parse(corpus.PretranslateBiblicalRange);
isInPretranslateRange = rowChaptersPerBook.Join(pretranslateBiblicalRangeChapters, rcpb => rcpb.Key, pbrc => pbrc.Key, (rcbp, pbrc) =>
rcbp.Value.Intersect(pbrc.Value).Count() > 0 || (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0)
).Any(b => b);
bool isInTrainOnChapters = false;
bool isInPretranslateChapters = false;
if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus stc && row.Refs.All(r => r is VerseRef))
{
Dictionary<string, List<int>>? rowChaptersPerBook = null;
if (corpus.TrainOnChapters != null || corpus.PretranslateChapters != null)
{
rowChaptersPerBook = row
.Refs.Cast<VerseRef>()
.GroupBy(vr => vr.Book)
.ToDictionary(g => g.Key, g => g.Select(vr => vr.ChapterNum).ToList());

if (corpus.TrainOnChapters != null)
{
isInTrainOnChapters = rowChaptersPerBook
.Join(
corpus.TrainOnChapters,
rcpb => rcpb.Key,
tobrc => tobrc.Key,
(rcbp, tobrc) =>
rcbp.Value.Intersect(tobrc.Value).Count() > 0
|| (rcbp.Value.Count() > 0 && tobrc.Value.Count() == 0) //Empty list means all chapters from book
)
.Any(b => b);
}
if (corpus.PretranslateChapters != null)
{
isInPretranslateChapters = rowChaptersPerBook
.Join(
corpus.PretranslateChapters,
rcpb => rcpb.Key,
pbrc => pbrc.Key,
(rcbp, pbrc) =>
rcbp.Value.Intersect(pbrc.Value).Count() > 0
|| (rcbp.Value.Count() > 0 && pbrc.Value.Count() == 0)
)
.Any(b => b);
}
}
}
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnRange)
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnChapters)
{
await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
counts["NumTrainRows"] += 1;
}
if (
(corpus.PretranslateAll || corpus.PretranslateTextIds.Contains(row.TextId) || isInPretranslateRange)
(
corpus.PretranslateAll
|| corpus.PretranslateTextIds.Contains(row.TextId)
|| isInPretranslateChapters
)
&& row.SourceSegment.Count > 0
&& row.TargetSegment.Count == 0
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,12 @@ private static Models.Corpus Map(Serval.Translation.V1.Corpus source)
TargetLanguage = source.TargetLanguage,
TrainOnAll = source.TrainOnAll,
PretranslateAll = source.PretranslateAll,
TrainOnBiblicalRange = source.TrainOnBiblicalRange,
PretranslateBiblicalRange = source.PretranslateBiblicalRange,
TrainOnChapters = source
.TrainOnChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToList()))

Check failure on line 259 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

'Corpus' does not contain a definition for 'TrainOnChapters' and no accessible extension method 'TrainOnChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)

Check failure on line 259 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

'Corpus' does not contain a definition for 'TrainOnChapters' and no accessible extension method 'TrainOnChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)

Check failure on line 259 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

'Corpus' does not contain a definition for 'TrainOnChapters' and no accessible extension method 'TrainOnChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)

Check failure on line 259 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

'Corpus' does not contain a definition for 'TrainOnChapters' and no accessible extension method 'TrainOnChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)
.ToDictionary(),
PretranslateChapters = source
.PretranslateChapters.Select(kvp => (kvp.Key, kvp.Value.Chapters.ToList()))

Check failure on line 262 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

'Corpus' does not contain a definition for 'PretranslateChapters' and no accessible extension method 'PretranslateChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)

Check failure on line 262 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on ubuntu-20.04

'Corpus' does not contain a definition for 'PretranslateChapters' and no accessible extension method 'PretranslateChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)

Check failure on line 262 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

'Corpus' does not contain a definition for 'PretranslateChapters' and no accessible extension method 'PretranslateChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)

Check failure on line 262 in src/SIL.Machine.AspNetCore/Services/ServalTranslationEngineServiceV1.cs

View workflow job for this annotation

GitHub Actions / Build on windows-latest

'Corpus' does not contain a definition for 'PretranslateChapters' and no accessible extension method 'PretranslateChapters' accepting a first argument of type 'Corpus' could be found (are you missing a using directive or an assembly reference?)
.ToDictionary(),
TrainOnTextIds = source.TrainOnTextIds.ToHashSet(),
PretranslateTextIds = source.PretranslateTextIds.ToHashSet(),
SourceFiles = source.SourceFiles.Select(Map).ToList(),
Expand Down
Loading

0 comments on commit 1119992

Please sign in to comment.