From 9cd9074691f877e6fbbe21ff5bb3267d07db462c Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 7 Feb 2024 18:08:19 -0500 Subject: [PATCH 1/2] Move logic to parallel text corpus --- .../Services/NmtPreprocessBuildJob.cs | 17 +-------- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 36 ++++++++++++++++++- .../Corpora/CorporaExtensionsTests.cs | 3 +- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs index 987b61823..0fc33fcb8 100644 --- a/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs +++ b/src/SIL.Machine.AspNetCore/Services/NmtPreprocessBuildJob.cs @@ -161,22 +161,7 @@ bool IsInChapters(Dictionary> bookChapters, object rowRef) IReadOnlyList refs; if (row.TargetRefs.Count == 0) { - if (targetCorpora[CorpusType.Text] is ScriptureTextCorpus tstc) - { - refs = row - .SourceRefs.Cast() - .Select(srcRef => - { - var trgRef = srcRef.Clone(); - trgRef.ChangeVersification(tstc.Versification); - return (object)trgRef; - }) - .ToList(); - } - else - { - refs = row.SourceRefs; - } + refs = row.SourceRefs; } else { diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 415f10c74..761f11836 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -68,6 +68,10 @@ public IEnumerable GetRows() using (IEnumerator alignmentEnumerator = AlignmentCorpus.GetRows(textIds).GetEnumerator()) { var rangeInfo = new RangeInfo(); + rangeInfo.Versification = + TargetCorpus is ScriptureTextCorpus tc && SourceCorpus is ScriptureTextCorpus + ? tc.Versification + : null; var sourceSameRefRows = new List(); var targetSameRefRows = new List(); @@ -357,6 +361,19 @@ private IEnumerable CreateRows( var sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); var targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); + if (targetRefs.Length == 0 && TargetCorpus is ScriptureTextCorpus stc) + { + targetRefs = sourceRefs + .Cast() + .Select(r => + { + var t = r.Clone(); + t.ChangeVersification(stc.Versification); + return t; + }) + .Cast() + .ToArray(); + } TextRowFlags sourceFlags; if (srcRow == null) @@ -465,9 +482,26 @@ private class RangeInfo public bool IsSourceEmpty => SourceSegment.Count == 0; public bool IsTargetEmpty => TargetSegment.Count == 0; + public ScrVers Versification { get; set; } = null; + public ParallelTextRow CreateRow() { - var row = new ParallelTextRow(TextId, SourceRefs.ToArray(), TargetRefs.ToArray()) + var trgRefs = TargetRefs.ToArray(); + if (TargetRefs.Count == 0 && Versification != null) + { + trgRefs = SourceRefs + .ToArray() + .Cast() + .Select(r => + { + var t = r.Clone(); + t.ChangeVersification(Versification); + return t; + }) + .Cast() + .ToArray(); + } + var row = new ParallelTextRow(TextId, SourceRefs.ToArray(), trgRefs) { SourceSegment = SourceSegment.ToArray(), TargetSegment = TargetSegment.ToArray(), diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 86a2bc9ba..e762bc03c 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -15,9 +15,9 @@ public void ExtractScripture() Assert.That(lines.Count, Is.EqualTo(41899)); (string text, VerseRef origRef, VerseRef? corpusRef) = lines[0]; + Assert.That(text, Is.EqualTo("")); Assert.That(origRef, Is.EqualTo(new VerseRef("GEN 1:1", ScrVers.Original))); - Assert.That(corpusRef.HasValue, Is.False); (text, origRef, corpusRef) = lines[3167]; Assert.That(text, Is.EqualTo("Chapter fourteen, verse fifty-five. Segment b.")); @@ -32,7 +32,6 @@ public void ExtractScripture() (text, origRef, corpusRef) = lines[10727]; Assert.That(text, Is.EqualTo("")); Assert.That(origRef, Is.EqualTo(new VerseRef("1CH 12:4", ScrVers.Original))); - Assert.That(corpusRef.HasValue, Is.False); (text, origRef, corpusRef) = lines[10731]; Assert.That(text, Is.EqualTo("")); From 37380ce0f3ae798f72bbb17bd4bcf7d7443b1f28 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 8 Feb 2024 12:03:17 -0500 Subject: [PATCH 2/2] Review changes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 6 +++--- tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 9a265be7b..f5019e8fa 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -269,7 +269,7 @@ public static ITextCorpus Flatten(this IEnumerable corpora) return new FlattenTextCorpus(corpusArray); } - public static IEnumerable<(string Text, VerseRef RefCorpusVerseRef, VerseRef? CorpusVerseRef)> ExtractScripture( + public static IEnumerable<(string Text, VerseRef RefCorpusVerseRef, VerseRef CorpusVerseRef)> ExtractScripture( this ITextCorpus corpus, ITextCorpus refCorpus = null ) @@ -290,7 +290,7 @@ public static ITextCorpus Flatten(this IEnumerable corpora) && vref.CompareTo(curRef.Value, null, compareAllVerses: true, compareSegments: false) != 0 ) { - yield return (curTrgLineRange ? "" : curTrgLine.ToString(), curRef.Value, curTrgRef); + yield return (curTrgLineRange ? "" : curTrgLine.ToString(), curRef.Value, curTrgRef.Value); curTrgLineRange = curTrgLineRange || curTrgLine.Length > 0; curTrgLine = new StringBuilder(); curTrgRef = null; @@ -348,7 +348,7 @@ public static ITextCorpus Flatten(this IEnumerable corpora) } if (curRef.HasValue) - yield return (curTrgLineRange ? "" : curTrgLine.ToString(), curRef.Value, curTrgRef); + yield return (curTrgLineRange ? "" : curTrgLine.ToString(), curRef.Value, curTrgRef.Value); } private class TransformTextCorpus : TextCorpusBase diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index e762bc03c..51a0a7a70 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -18,6 +18,7 @@ public void ExtractScripture() Assert.That(text, Is.EqualTo("")); Assert.That(origRef, Is.EqualTo(new VerseRef("GEN 1:1", ScrVers.Original))); + Assert.That(corpusRef, Is.EqualTo(new VerseRef("GEN 1:1", corpus.Versification))); (text, origRef, corpusRef) = lines[3167]; Assert.That(text, Is.EqualTo("Chapter fourteen, verse fifty-five. Segment b.")); @@ -32,6 +33,7 @@ public void ExtractScripture() (text, origRef, corpusRef) = lines[10727]; Assert.That(text, Is.EqualTo("")); Assert.That(origRef, Is.EqualTo(new VerseRef("1CH 12:4", ScrVers.Original))); + Assert.That(corpusRef, Is.EqualTo(new VerseRef("1CH 12:4", corpus.Versification))); (text, origRef, corpusRef) = lines[10731]; Assert.That(text, Is.EqualTo(""));