Skip to content

Commit

Permalink
Add support for non-verse text segments in Scripture corpora
Browse files Browse the repository at this point in the history
- add new ScriptureRef corpus ref class
- update Scripture corpora classes to use ScriptureRef
- add ScriptureRefUsfmParserHandlerBase class to track ScriptureRef in USFM
- update UsfmTextUpdater and UsfmTextBase to use ScriptureRefUsfmParserHandlerBase
- add support for updating non-Scripture paragraphs and notes
  • Loading branch information
ddaspit committed Apr 2, 2024
1 parent 293bbdf commit 6db7f70
Show file tree
Hide file tree
Showing 27 changed files with 1,471 additions and 526 deletions.
22 changes: 16 additions & 6 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)
bool curTrgLineRange = true;
foreach (ParallelTextRow row in parallelCorpus)
{
var vref = (VerseRef)row.Ref;
VerseRef vref = ((ScriptureRef)row.Ref).VerseRef;
if (
curRef.HasValue
&& vref.CompareTo(curRef.Value, null, compareAllVerses: true, compareSegments: false) != 0
Expand All @@ -299,14 +299,14 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)
curRef = vref;
if (!curTrgRef.HasValue && row.TargetRefs.Count > 0)
{
curTrgRef = (VerseRef)row.TargetRefs[0];
curTrgRef = ((ScriptureRef)row.TargetRefs[0]).VerseRef;
}
else if (curTrgRef.HasValue && row.TargetRefs.Count > 0 && !curTrgRef.Value.Equals(row.TargetRefs[0]))
{
curTrgRef.Value.Simplify();
var trgRef = (VerseRef)row.TargetRefs[0];
VerseRef startRef,
endRef;
VerseRef trgRef = ((ScriptureRef)row.TargetRefs[0]).VerseRef;
VerseRef startRef;
VerseRef endRef;
if (curTrgRef.Value < trgRef)
{
startRef = curTrgRef.Value;
Expand Down Expand Up @@ -353,7 +353,7 @@ public static ITextCorpus Flatten(this IEnumerable<ITextCorpus> corpora)

public static bool IsScripture(this ITextCorpus textCorpus)
{
return textCorpus is ScriptureTextCorpus;
return textCorpus.Versification != null;
}

private class TransformTextCorpus : TextCorpusBase
Expand All @@ -372,6 +372,8 @@ public TransformTextCorpus(ITextCorpus corpus, Func<TextRow, TextRow> transform,

public override bool IsTokenized { get; }

public override ScrVers Versification => _corpus.Versification;

public override int Count(bool includeEmpty = true)
{
return _corpus.Count(includeEmpty);
Expand All @@ -398,6 +400,8 @@ public WhereTextCorpus(ITextCorpus corpus, Func<TextRow, int, bool> predicate)

public override bool IsTokenized => _corpus.IsTokenized;

public override ScrVers Versification => _corpus.Versification;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
return _corpus.GetRows(textIds).Where(_predicate);
Expand All @@ -419,6 +423,8 @@ public TextFilterTextCorpus(ITextCorpus corpus, Func<IText, bool> predicate)

public override bool IsTokenized => _corpus.IsTokenized;

public override ScrVers Versification => _corpus.Versification;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
return _corpus.GetRows(textIds ?? Texts.Select(t => t.Id));
Expand All @@ -440,6 +446,8 @@ public TakeTextCorpus(ITextCorpus corpus, int count)

public override bool IsTokenized => _corpus.IsTokenized;

public override ScrVers Versification => _corpus.Versification;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
return _corpus.GetRows(textIds).Take(_count);
Expand All @@ -459,6 +467,8 @@ public FlattenTextCorpus(ITextCorpus[] corpora)

public override bool IsTokenized => _corpora.All(corpus => corpus.IsTokenized);

public override ScrVers Versification => _corpora.Length > 0 ? _corpora[0].Versification : null;

public override int Count(bool includeEmpty = true)
{
return _corpora.Sum(corpus => corpus.Count(includeEmpty));
Expand Down
3 changes: 3 additions & 0 deletions src/SIL.Machine/Corpora/DictionaryTextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Collections.Generic;
using System.Linq;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand All @@ -21,6 +22,8 @@ public DictionaryTextCorpus(IEnumerable<IText> texts)

public bool IsTokenized { get; set; }

public ScrVers Versification { get; set; }

public override int Count(bool includeEmpty = true)
{
return Texts.Sum(t => t.Count(includeEmpty));
Expand Down
3 changes: 3 additions & 0 deletions src/SIL.Machine/Corpora/ITextCorpus.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Collections.Generic;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand All @@ -9,5 +10,7 @@ public interface ITextCorpus : ICorpus<TextRow>
IEnumerable<TextRow> GetRows(IEnumerable<string> textIds);

bool IsTokenized { get; }

ScrVers Versification { get; }
}
}
122 changes: 51 additions & 71 deletions src/SIL.Machine/Corpora/ParallelTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,20 @@ public IEnumerable<ParallelTextRow> GetRows()
textIds = targetTextIds;

using (IEnumerator<TextRow> srcEnumerator = SourceCorpus.GetRows(textIds).GetEnumerator())
using (var trgEnumerator = new TargetCorpusEnumerator(TargetCorpus.GetRows(textIds).GetEnumerator()))
using (
var trgEnumerator = new TargetCorpusEnumerator(
TargetCorpus.GetRows(textIds).GetEnumerator(),
SourceCorpus.Versification,
TargetCorpus.Versification
)
)
using (IEnumerator<AlignmentRow> alignmentEnumerator = AlignmentCorpus.GetRows(textIds).GetEnumerator())
{
var rangeInfo = new RangeInfo();
rangeInfo.Versification =
TargetCorpus is ScriptureTextCorpus tc && SourceCorpus is ScriptureTextCorpus
? tc.Versification
: null;
var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification };
var sourceSameRefRows = new List<TextRow>();
var targetSameRefRows = new List<TextRow>();

bool srcCompleted = !srcEnumerator.MoveNext();
if (!srcCompleted && srcEnumerator.Current.Ref is VerseRef verseRef)
trgEnumerator.SourceVersification = verseRef.Versification;
bool trgCompleted = !trgEnumerator.MoveNext();
while (!srcCompleted && !trgCompleted)
{
Expand Down Expand Up @@ -359,18 +359,13 @@ private IEnumerable<ParallelTextRow> CreateRows(
else
throw new ArgumentNullException("Either a source or target must be specified.");

var sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty<object>();
var targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty<object>();
object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty<object>();
object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty<object>();
if (targetRefs.Length == 0 && TargetCorpus is ScriptureTextCorpus stc)
{
targetRefs = sourceRefs
.Cast<VerseRef>()
.Select(r =>
{
var t = r.Clone();
t.ChangeVersification(stc.Versification);
return t;
})
.Cast<ScriptureRef>()
.Select(r => r.ChangeVersification(stc.Versification))
.Cast<object>()
.ToArray();
}
Expand Down Expand Up @@ -486,22 +481,17 @@ private class RangeInfo
public bool IsSourceEmpty => SourceSegment.Count == 0;
public bool IsTargetEmpty => TargetSegment.Count == 0;

public ScrVers Versification { get; set; } = null;
public ScrVers TargetVersification { get; set; } = null;

public ParallelTextRow CreateRow()
{
object[] trgRefs = TargetRefs.ToArray();
if (TargetRefs.Count == 0 && Versification != null)
if (TargetRefs.Count == 0 && TargetVersification != null)
{
trgRefs = SourceRefs
.ToArray()
.Cast<VerseRef>()
.Select(r =>
{
VerseRef t = r.Clone();
t.ChangeVersification(Versification);
return t;
})
.Cast<ScriptureRef>()
.Select(r => r.ChangeVersification(TargetVersification))
.Cast<object>()
.ToArray();
}
Expand All @@ -525,14 +515,11 @@ public ParallelTextRow CreateRow()

private class DefaultRowRefComparer : IComparer<object>
{
private static readonly VerseRefComparer VerseRefComparer = new VerseRefComparer(compareSegments: false);

public int Compare(object x, object y)
{
// Do not use the default comparer for VerseRef, since we want to compare all verses in a range or
// sequence
if (x is VerseRef vx && y is VerseRef vy)
return VerseRefComparer.Compare(vx, vy);
// Do not use the default comparer for ScriptureRef, since we want to ignore segments
if (x is ScriptureRef sx && y is ScriptureRef sy)
return sx.CompareTo(sy, compareSegments: false);

return Comparer<object>.Default.Compare(x, y);
}
Expand All @@ -541,48 +528,40 @@ public int Compare(object x, object y)
private class TargetCorpusEnumerator : DisposableBase, IEnumerator<TextRow>
{
private readonly IEnumerator<TextRow> _enumerator;
private bool _isScripture = false;
private bool _isEnumerating = false;
private readonly bool _isScripture = false;
private readonly Queue<TextRow> _verseRows;
private readonly ScrVers _sourceVersification;
private TextRow _current;
private bool _isEnumerating = false;

public TargetCorpusEnumerator(IEnumerator<TextRow> enumerator)
public TargetCorpusEnumerator(
IEnumerator<TextRow> enumerator,
ScrVers sourceVersification,
ScrVers targetVersification
)
{
_enumerator = enumerator;
_sourceVersification = sourceVersification;
_isScripture =
sourceVersification != null
&& targetVersification != null
&& sourceVersification != targetVersification;
_verseRows = new Queue<TextRow>();
}

public ScrVers SourceVersification { get; set; }

public TextRow Current => _current;

object IEnumerator.Current => Current;

public bool MoveNext()
{
bool result;
if (!_isEnumerating)
if (_isScripture)
{
_isEnumerating = true;
result = _enumerator.MoveNext();
if (
result
&& _enumerator.Current.Ref is VerseRef verseRef
&& SourceVersification != null
&& SourceVersification != verseRef.Versification
)
{
_isScripture = true;
}
else
if (!_isEnumerating)
{
_current = _enumerator.Current;
return result;
_enumerator.MoveNext();
_isEnumerating = true;
}
}

if (_isScripture)
{
if (_verseRows.Count == 0 && _enumerator.Current != null)
CollectVerses();
if (_verseRows.Count > 0)
Expand All @@ -594,7 +573,7 @@ public bool MoveNext()
return false;
}

result = _enumerator.MoveNext();
bool result = _enumerator.MoveNext();
_current = _enumerator.Current;
return result;
}
Expand All @@ -603,7 +582,6 @@ public void Reset()
{
_enumerator.Reset();
_isEnumerating = false;
_isScripture = false;
}

protected override void DisposeManagedResources()
Expand All @@ -613,23 +591,25 @@ protected override void DisposeManagedResources()

private void CollectVerses()
{
var rowList = new List<(VerseRef Ref, TextRow Row)>();
var rowList = new List<(ScriptureRef Ref, TextRow Row)>();
bool outOfOrder = false;
var prevVerseRef = new VerseRef();
ScriptureRef prevScrRef = ScriptureRef.Empty;
int rangeStartOffset = -1;
do
{
TextRow row = _enumerator.Current;
var verseRef = (VerseRef)row.Ref;
if (!prevVerseRef.IsDefault && verseRef.BookNum != prevVerseRef.BookNum)
var scrRef = (ScriptureRef)row.Ref;
if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum)
break;

verseRef.ChangeVersification(SourceVersification);
scrRef = scrRef.ChangeVersification(_sourceVersification);
// convert one-to-many versification mapping to a verse range
if (verseRef.Equals(prevVerseRef))
if (scrRef.Equals(prevScrRef))
{
var (rangeStartVerseRef, rangeStartRow) = rowList[rowList.Count + rangeStartOffset];
var flags = TextRowFlags.InRange;
(ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[
rowList.Count + rangeStartOffset
];
TextRowFlags flags = TextRowFlags.InRange;
if (rangeStartRow.IsSentenceStart)
flags |= TextRowFlags.SentenceStart;
if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart))
Expand All @@ -649,16 +629,16 @@ private void CollectVerses()
{
rangeStartOffset = -1;
}
rowList.Add((verseRef, row));
if (!outOfOrder && verseRef.CompareTo(prevVerseRef) < 0)
rowList.Add((scrRef, row));
if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0)
outOfOrder = true;
prevVerseRef = verseRef;
prevScrRef = scrRef;
} while (_enumerator.MoveNext());

if (outOfOrder)
rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref));

foreach (var (_, row) in rowList)
foreach ((ScriptureRef _, TextRow row) in rowList)
_verseRows.Enqueue(row);
}
}
Expand Down
Loading

0 comments on commit 6db7f70

Please sign in to comment.