Skip to content

Commit

Permalink
Align all Scripture corpora to Original versification
Browse files Browse the repository at this point in the history
  • Loading branch information
ddaspit committed Mar 4, 2024
1 parent 0f42b05 commit f45bca8
Show file tree
Hide file tree
Showing 32 changed files with 457 additions and 334 deletions.
297 changes: 204 additions & 93 deletions src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs
Original file line number Diff line number Diff line change
@@ -1,22 +1,43 @@
namespace SIL.Machine.AspNetCore.Services;

public class NmtPreprocessBuildJob(
IPlatformService platformService,
IRepository<TranslationEngine> engines,
IDistributedReaderWriterLockFactory lockFactory,
ILogger<NmtPreprocessBuildJob> logger,
IBuildJobService buildJobService,
ISharedFileService sharedFileService,
ICorpusService corpusService,
ILanguageTagService languageTagService
) : HangfireBuildJob<IReadOnlyList<Corpus>>(platformService, engines, lockFactory, buildJobService, logger)
public class NmtPreprocessBuildJob : HangfireBuildJob<IReadOnlyList<Corpus>>
{
private static readonly JsonSerializerOptions PretranslateSerializerOptions =
new() { WriteIndented = true, PropertyNamingPolicy = JsonNamingPolicy.CamelCase };

private readonly ISharedFileService _sharedFileService = sharedFileService;
private readonly ICorpusService _corpusService = corpusService;
private readonly ILanguageTagService _languageTagService = languageTagService;
private readonly ISharedFileService _sharedFileService;
private readonly ICorpusService _corpusService;
private readonly ILanguageTagService _languageTagService;
private int _seed = 1234;
private Random _random;

/// <summary>
/// Creates the preprocess build job. The platform, engine repository, lock factory, build job
/// service and logger are forwarded to the base class; the remaining services are stored on this
/// instance and the random generator is created from the current seed.
/// </summary>
public NmtPreprocessBuildJob(
    IPlatformService platformService,
    IRepository<TranslationEngine> engines,
    IDistributedReaderWriterLockFactory lockFactory,
    ILogger<NmtPreprocessBuildJob> logger,
    IBuildJobService buildJobService,
    ISharedFileService sharedFileService,
    ICorpusService corpusService,
    ILanguageTagService languageTagService
)
    : base(platformService, engines, lockFactory, buildJobService, logger)
{
    (_sharedFileService, _corpusService, _languageTagService) = (
        sharedFileService,
        corpusService,
        languageTagService
    );
    _random = new Random(_seed);
}

/// <summary>
/// Seed of the random generator held by this job. Assigning a value replaces the generator
/// with a new one created from that seed.
/// </summary>
internal int Seed
{
    get { return _seed; }
    set
    {
        _random = new Random(value);
        _seed = value;
    }
}

protected override async Task DoWorkAsync(
string engineId,
Expand Down Expand Up @@ -98,86 +119,57 @@ async IAsyncEnumerable<Pretranslation> ProcessRowsAsync()
if (sourceTextCorpora.Length == 0 || targetTextCorpus is null)
continue;

IParallelTextCorpus parallelTextCorpus = sourceTextCorpora[0]
.AlignRows(targetTextCorpus, allSourceRows: true);
foreach (
ParallelTextRow row in parallelTextCorpus.Where(r =>
r.SourceSegment.Count > 0 && r.TargetSegment.Count == 0 && !r.IsTargetInRange
)
)
//var src1 = AlignScripture(sourceTextCorpora[0], targetTextCorpus).ToArray();
//var src2 = AlignScripture(sourceTextCorpora[1], targetTextCorpus).ToArray();

//var rs = AlignCorpora(sourceTextCorpora, targetTextCorpus).ToArray();

int skipCount = 0;
foreach (Row?[] rows in AlignCorpora(sourceTextCorpora, targetTextCorpus))
{
bool isInTrainOnChapters = false;
if (corpus.TrainOnChapters is not null)
isInTrainOnChapters = row.Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r));
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(row.TextId) || isInTrainOnChapters)
if (skipCount > 0)
{
await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
await targetTrainWriter.WriteAsync("\n");
skipCount--;
continue;
}

bool isInPretranslateChapters = false;
if (corpus.PretranslateChapters is not null)
isInPretranslateChapters = row.Refs.Any(r => IsInChapters(corpus.PretranslateChapters, r));
if (
corpus.PretranslateAll
|| corpus.PretranslateTextIds.Contains(row.TextId)
|| isInPretranslateChapters
)
Row[] trainRows = rows.Where(r => r is not null && IsInTrain(r, corpus)).Cast<Row>().ToArray();
if (trainRows.Length > 0)
{
IReadOnlyList<object> refs;
if (row.TargetRefs.Count == 0)
Row row = trainRows[0];
if (rows.Length > 1)
{
refs = row.SourceRefs;
}
else
{
refs = row.TargetRefs;
Row[] nonEmptyRows = trainRows.Where(r => r.SourceSegment.Length > 0).ToArray();
if (nonEmptyRows.Length > 0)
row = nonEmptyRows[_random.Next(nonEmptyRows.Length)];
}

await sourceTrainWriter.WriteAsync($"{row.SourceSegment}\n");
await targetTrainWriter.WriteAsync($"{row.TargetSegment}\n");
skipCount = row.RowCount - 1;
if (row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
trainCount++;
}

Row? pretranslateRow = rows[0];
if (
pretranslateRow is not null
&& IsInPretranslate(pretranslateRow, corpus)
&& pretranslateRow.SourceSegment.Length > 0
&& pretranslateRow.TargetSegment.Length == 0
)
{
yield return new Pretranslation
{
CorpusId = corpus.Id,
TextId = row.TextId,
Refs = refs.Select(r => r.ToString() ?? "").ToList(),
Translation = row.SourceText
TextId = pretranslateRow.TextId,
Refs = pretranslateRow.Refs.Select(r => r.ToString() ?? "").ToList(),
Translation = pretranslateRow.SourceSegment
};
pretranslateCount++;
}
}

Random random = new();
foreach (
ParallelTextRow[] rows in sourceTextCorpora
.Select(stc => AlignCorpora(stc, targetTextCorpus))
.ZipMany(rows => rows.ToArray())
.Where(rows => rows.Any(r => r.TargetSegment.Count > 0))
)
{
bool isInTrainOnChapters = false;
if (corpus.TrainOnChapters is not null)
isInTrainOnChapters = rows[0].Refs.Any(r => IsInChapters(corpus.TrainOnChapters, r));
if (corpus.TrainOnAll || corpus.TrainOnTextIds.Contains(rows[0].TextId) || isInTrainOnChapters)
{
ParallelTextRow? row = null;
if (rows.Length == 1)
{
if (!rows[0].IsEmpty)
row = rows[0];
}
else
{
ParallelTextRow[] nonEmptyRows = rows.Where(r => !r.IsEmpty).ToArray();
if (nonEmptyRows.Length > 0)
row = nonEmptyRows[random.Next(nonEmptyRows.Length)];
}
if (row is not null)
{
await sourceTrainWriter.WriteAsync($"{row.SourceText}\n");
await targetTrainWriter.WriteAsync($"{row.TargetText}\n");
trainCount++;
}
}
}

if ((bool?)buildOptionsObject?["use_key_terms"] ?? true)
{
ITextCorpus? sourceTermCorpus = _corpusService
Expand Down Expand Up @@ -236,6 +228,31 @@ JobCompletionStatus completionStatus
}
}

/// <summary>
/// Returns whether <paramref name="row"/> is selected by the corpus' train-on settings
/// (train-on-all flag, text ids, or chapters).
/// </summary>
private static bool IsInTrain(Row row, Corpus corpus) =>
    IsIncluded(row, corpus.TrainOnAll, corpus.TrainOnTextIds, corpus.TrainOnChapters);

/// <summary>
/// Returns whether <paramref name="row"/> is selected by the corpus' pretranslate settings
/// (pretranslate-all flag, text ids, or chapters).
/// </summary>
private static bool IsInPretranslate(Row row, Corpus corpus) =>
    IsIncluded(row, corpus.PretranslateAll, corpus.PretranslateTextIds, corpus.PretranslateChapters);

/// <summary>
/// Shared selection test used for both training and pretranslation filters.
/// </summary>
/// <param name="row">The row being tested.</param>
/// <param name="all">When true, every row is included.</param>
/// <param name="textIds">Text ids whose rows are included.</param>
/// <param name="chapters">Optional book/chapter filter; ignored when null.</param>
/// <returns>
/// True when any of the row's refs matches <paramref name="chapters"/>, when
/// <paramref name="all"/> is set, or when the row's text id is in <paramref name="textIds"/>.
/// </returns>
private static bool IsIncluded(
    Row row,
    bool all,
    IReadOnlySet<string> textIds,
    IReadOnlyDictionary<string, IReadOnlySet<int>>? chapters
)
{
    // The chapter filter is checked first, exactly as before; the remaining checks only run
    // when no ref matched a chapter entry.
    bool matchesChapter = chapters is not null && row.Refs.Any(rowRef => IsInChapters(chapters, rowRef));
    return matchesChapter || all || textIds.Contains(row.TextId);
}

private static bool IsInChapters(IReadOnlyDictionary<string, IReadOnlySet<int>> bookChapters, object rowRef)
{
if (rowRef is not VerseRef vr)
Expand All @@ -244,30 +261,124 @@ private static bool IsInChapters(IReadOnlyDictionary<string, IReadOnlySet<int>>
&& (chapters.Contains(vr.ChapterNum) || chapters.Count == 0);
}

private static IEnumerable<ParallelTextRow> AlignCorpora(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
/// <summary>
/// Aligns every source corpus against the target corpus and groups the results so that each
/// yielded array holds the rows produced for the same position across the source corpora.
/// </summary>
/// <param name="srcCorpora">The source corpora to align.</param>
/// <param name="trgCorpus">The target corpus all sources are aligned to.</param>
/// <returns>
/// Row groups; for a Scripture target a group may contain null entries, and groups made up only
/// of rows with empty source and target text are dropped.
/// </returns>
private static IEnumerable<Row?[]> AlignCorpora(IReadOnlyList<ITextCorpus> srcCorpora, ITextCorpus trgCorpus)
{
    static bool HasText(Row row) => row.SourceSegment.Length > 0 || row.TargetSegment.Length > 0;

    if (!trgCorpus.IsScripture())
    {
        // Rows that have no target segment, taken from an alignment that keeps all source rows.
        IEnumerable<Row[]> withoutTarget = srcCorpora
            .Select(src => src.AlignRows(trgCorpus, allSourceRows: true))
            .ZipMany(group =>
                group
                    .Where(aligned => aligned.TargetSegment.Count == 0)
                    .Select(aligned => new Row(
                        aligned.TextId,
                        aligned.Refs,
                        aligned.SourceText,
                        aligned.TargetText,
                        1
                    ))
                    .ToArray()
            );

        // Rows that do have a target segment, taken from an alignment that keeps all target rows.
        IEnumerable<Row[]> withTarget = srcCorpora
            .Select(src => src.AlignRows(trgCorpus, allTargetRows: true))
            .ZipMany(group =>
                group
                    .Where(aligned => aligned.TargetSegment.Count > 0)
                    .Select(aligned => new Row(
                        aligned.TextId,
                        aligned.Refs,
                        aligned.SourceText,
                        aligned.TargetText,
                        1
                    ))
                    .ToArray()
            );

        return withoutTarget.Concat(withTarget).Where(group => group.Any(HasText));
    }

    // Scripture: a group is kept when it contains a null placeholder or any row with text.
    return srcCorpora
        .Select(src => AlignScripture(src, trgCorpus))
        .ZipMany(group => group.ToArray())
        .Where(group => group.Any(r => r is null || HasText(r)));
}

/// <summary>
/// Zips the verses extracted from the source corpus with those extracted from the target corpus
/// and merges consecutive verses into groups. For each group one Row is yielded, followed by
/// (RowCount - 1) null placeholders, so the output has one element per zipped verse.
/// </summary>
// NOTE(review): this span is a rendered diff without +/- markers. The statements that refer to
// rangeRows and to a ParallelTextRow named row appear to belong to the removed implementation
// and do not fit the Row-based logic around them - confirm against the committed file.
private static IEnumerable<Row?> AlignScripture(ITextCorpus srcCorpus, ITextCorpus trgCorpus)
{
List<ParallelTextRow> rangeRows = [];
foreach (ParallelTextRow row in srcCorpus.AlignRows(trgCorpus, allTargetRows: true))
// State of the group currently being accumulated: number of zipped verses in it, the
// concatenated source/target text, and the set of verse refs it covers.
int rowCount = 0;
StringBuilder srcSegBuffer = new();
StringBuilder trgSegBuffer = new();
HashSet<VerseRef> vrefs = [];
foreach (
(VerseRef vref, string srcSegment, string trgSegment) in srcCorpus
.ExtractScripture()
.Select(r => (r.CorpusVerseRef, r.Text))
.Zip(
trgCorpus.ExtractScripture().Select(r => r.Text),
(s, t) => (VerseRef: s.CorpusVerseRef, SourceSegment: s.Text, TargetSegment: t)
)
)
{
if (rangeRows.Count > 0 && row.IsSourceInRange && !row.IsSourceRangeStart)
// "<range>" on both sides: the verse only extends the current group (refs and count).
// Presumably "<range>" marks a verse whose text was folded into an earlier verse by
// ExtractScripture - TODO confirm.
if (srcSegment == "<range>" && trgSegment == "<range>")
{
vrefs.UnionWith(vref.AllVerses());
rowCount++;
}
// Only the source is "<range>": extend the group and append any target text, space-separated.
else if (srcSegment == "<range>")
{
rangeRows[0].TargetSegment = [rangeRows[0].TargetText + " " + row.TargetText];
row.TargetSegment = [];
rangeRows.Add(row);
vrefs.UnionWith(vref.AllVerses());
if (trgSegment.Length > 0)
{
if (trgSegBuffer.Length > 0)
trgSegBuffer.Append(' ');
trgSegBuffer.Append(trgSegment);
}
rowCount++;
}
// Only the target is "<range>": extend the group and append any source text, space-separated.
else if (trgSegment == "<range>")
{
vrefs.UnionWith(vref.AllVerses());
if (srcSegment.Length > 0)
{
if (srcSegBuffer.Length > 0)
srcSegBuffer.Append(' ');
srcSegBuffer.Append(srcSegment);
}
rowCount++;
}
else
{
if (rangeRows.Count > 0)
// Neither side is "<range>": emit the group accumulated so far, then start a new one.
if (rowCount > 0)
{
foreach (ParallelTextRow rangeRow in rangeRows)
yield return rangeRow;
rangeRows.Clear();
yield return new(
vrefs.First().Book,
vrefs.Order().Cast<object>().ToArray(),
srcSegBuffer.ToString(),
trgSegBuffer.ToString(),
rowCount
);
// Pad with nulls so the number of yielded elements equals the number of zipped verses.
for (int i = 0; i < rowCount - 1; i++)
yield return null;
srcSegBuffer.Clear();
trgSegBuffer.Clear();
vrefs.Clear();
rowCount = 0;
}
if (row.IsSourceRangeStart)
rangeRows.Add(row);
else
yield return row;
vrefs.UnionWith(vref.AllVerses());
srcSegBuffer.Append(srcSegment);
trgSegBuffer.Append(trgSegment);
rowCount++;
}
}

// Emit the final group, which the loop leaves pending.
if (rowCount > 0)
{
yield return new(
vrefs.First().Book,
vrefs.Order().Cast<object>().ToArray(),
srcSegBuffer.ToString(),
trgSegBuffer.ToString(),
rowCount
);
for (int i = 0; i < rowCount - 1; i++)
yield return null;
}
}

/// <summary>
/// A source/target text pair produced when aligning a source corpus with the target corpus.
/// </summary>
/// <param name="TextId">
/// Id of the text the row belongs to; Scripture rows use the book of one of their verse refs.
/// </param>
/// <param name="Refs">References covered by the row.</param>
/// <param name="SourceSegment">Source text of the row; may be empty.</param>
/// <param name="TargetSegment">Target text of the row; may be empty.</param>
/// <param name="RowCount">
/// Number of aligned verses merged into this row; rows built from non-Scripture alignments use 1.
/// </param>
private record Row(
string TextId,
IReadOnlyList<object> Refs,
string SourceSegment,
string TargetSegment,
int RowCount
);
}
5 changes: 5 additions & 0 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,11 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)
yield return (curTrgLineRange ? "<range>" : curTrgLine.ToString(), curRef.Value, curTrgRef.Value);
}

/// <summary>
/// Indicates whether <paramref name="textCorpus"/> is a <see cref="ScriptureTextCorpus"/>.
/// </summary>
public static bool IsScripture(this ITextCorpus textCorpus) => textCorpus is ScriptureTextCorpus;

private class TransformTextCorpus : TextCorpusBase
{
private readonly ITextCorpus _corpus;
Expand Down
Loading

0 comments on commit f45bca8

Please sign in to comment.