Skip to content

Commit

Permalink
Add support for detokenizing USFM (#154)
Browse files Browse the repository at this point in the history
- closes #153
  • Loading branch information
ddaspit authored Jan 5, 2024
1 parent 05c2ee7 commit 6451c71
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 85 deletions.
185 changes: 117 additions & 68 deletions src/SIL.Machine/Corpora/UsfmParser.cs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/SIL.Machine/Corpora/UsfmTextBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ protected override IEnumerable<TextRow> GetVersesInDocOrder()
{
string usfm = ReadUsfm();
var rowCollector = new TextRowCollector(this);
UsfmParser.Parse(_stylesheet, usfm, rowCollector, Versification, preserveWhitespace: _includeMarkers);
UsfmParser.Parse(usfm, rowCollector, _stylesheet, Versification, preserveWhitespace: _includeMarkers);
return rowCollector.Rows;
}

Expand Down
8 changes: 5 additions & 3 deletions src/SIL.Machine/Corpora/UsfmToken.cs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ public int GetLength(bool includeNewlines = false, bool addSpaces = true)

if (!string.IsNullOrEmpty(Data))
{
if (Marker.Length > 0)
if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*'))
totalLength++;
totalLength += Data.Length;
if (addSpaces)
Expand Down Expand Up @@ -225,7 +225,7 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true)

if (!string.IsNullOrEmpty(Data))
{
if (Marker.Length > 0)
if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*'))
sb.Append(' ');
sb.Append(Data);
if (addSpaces)
Expand All @@ -236,11 +236,13 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true)
{
string attributes = ToAttributeString();
if (attributes != "")
{
sb.Append(attributes);
}
else
{
// remove space that was put after marker - not needed when there are no attributes.
sb.Length -= 1;
sb.Length--;
}
sb.Append(@"\*");
}
Expand Down
136 changes: 130 additions & 6 deletions src/SIL.Machine/Corpora/UsfmTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,41 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace SIL.Machine.Corpora
{
/// <summary>
/// Selects which Unicode direction mark <see cref="UsfmTokenizer.Detokenize"/> inserts into the
/// USFM of verse tokens.
/// </summary>
public enum RtlReferenceOrder
{
/// <summary>No direction mark is inserted; verse tokens are written unchanged.</summary>
NotSet,
/// <summary>A U+200F (RIGHT-TO-LEFT MARK) is inserted into verse tokens.</summary>
BookChapterVerse,
/// <summary>A U+200E (LEFT-TO-RIGHT MARK) is inserted into verse tokens.</summary>
BookVerseChapter
}

public class UsfmTokenizer
{
private const char ZeroWidthSpace = '\u200B';

private readonly UsfmStylesheet _stylesheet;
private static readonly Regex RtlVerseRegex = new Regex(
@"[\u200E\u200F]*(\d+\w?)[\u200E\u200F]*([\p{P}\p{S}])[\u200E\u200F]*(?=\d)",
RegexOptions.Compiled
);

public UsfmTokenizer(UsfmStylesheet stylesheet)
public UsfmTokenizer(
string stylesheetFileName = "usfm.sty",
RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet
)
: this(new UsfmStylesheet(stylesheetFileName), rtlReferenceOrder) { }

public UsfmTokenizer(UsfmStylesheet stylesheet, RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet)
{
_stylesheet = stylesheet;
Stylesheet = stylesheet ?? new UsfmStylesheet("usfm.sty");
RtlReferenceOrder = rtlReferenceOrder;
}

public UsfmStylesheet Stylesheet { get; }
public RtlReferenceOrder RtlReferenceOrder { get; }

public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace = false)
{
List<UsfmToken> tokens = new List<UsfmToken>();
Expand Down Expand Up @@ -112,7 +133,7 @@ ref text
}

// Lookup marker
UsfmTag tag = _stylesheet.GetTag(marker.TrimStart('+'));
UsfmTag tag = Stylesheet.GetTag(marker.TrimStart('+'));

// If starts with a plus and is not a character style or an end style, it is an unknown tag
if (
Expand All @@ -121,7 +142,7 @@ ref text
&& tag.StyleType != UsfmStyleType.End
)
{
tag = _stylesheet.GetTag(marker);
tag = Stylesheet.GetTag(marker);
}

string endMarker = tag.StyleType != UsfmStyleType.Milestone ? marker + "*" : tag.EndMarker;
Expand Down Expand Up @@ -276,6 +297,109 @@ ref text
return tokens;
}

/// <summary>
/// Builds a single USFM string by concatenating the USFM of each token, adjusting the
/// spaces and line breaks between tokens as it goes.
/// </summary>
/// <param name="tokens">The tokens to write, in document order.</param>
/// <param name="tokensHaveWhitespace">
/// When <c>false</c> (the default), a CR/LF is inserted before every book, chapter, paragraph
/// and verse token. When <c>true</c>, no line breaks are inserted, verse tokens are trimmed,
/// and duplicate spaces between a text token and the preceding output are collapsed.
/// </param>
/// <returns>
/// The assembled USFM. A non-empty result never starts with a space and always ends with a
/// line feed; an empty token sequence yields an empty string.
/// </returns>
public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespace = false)
{
UsfmToken prevToken = null;
var usfm = new StringBuilder();
foreach (UsfmToken token in tokens)
{
string tokenUsfm = "";
switch (token.Type)
{
case UsfmTokenType.Book:
case UsfmTokenType.Chapter:
case UsfmTokenType.Paragraph:
// Strip space from end of string before CR/LF
if (usfm.Length > 0)
{
// NOTE(review): && binds tighter than ||, so when tokensHaveWhitespace is false the
// last character is dropped without checking that it is a space. Presumably this
// relies on the previous token's USFM ending in a space - confirm.
if (
usfm[usfm.Length - 1] == ' ' && (prevToken != null && prevToken.ToUsfm().Trim() != "")
|| !tokensHaveWhitespace
)
{
usfm.Length--;
}
if (!tokensHaveWhitespace)
usfm.Append("\r\n");
}
tokenUsfm = token.ToUsfm();
break;
case UsfmTokenType.Verse:
// Add newline if after anything other than [ or (
if (usfm.Length > 0 && usfm[usfm.Length - 1] != '[' && usfm[usfm.Length - 1] != '(')
{
// NOTE(review): same unparenthesized &&/|| condition as the paragraph case above.
if (
usfm[usfm.Length - 1] == ' ' && (prevToken != null && prevToken.ToUsfm().Trim() != "")
|| !tokensHaveWhitespace
)
{
usfm.Length--;
}
if (!tokensHaveWhitespace)
usfm.Append("\r\n");
}

// Tokens that carry their own whitespace are trimmed so only the bare verse USFM is appended.
tokenUsfm = tokensHaveWhitespace ? token.ToUsfm().Trim() : token.ToUsfm();

if (RtlReferenceOrder != RtlReferenceOrder.NotSet)
{
// BookVerseChapter uses U+200E (left-to-right mark); the other order uses U+200F (right-to-left mark).
string directionMarker =
RtlReferenceOrder == RtlReferenceOrder.BookVerseChapter ? "\u200e" : "\u200f";
// Keep the number (group 1) and the separator (group 2) and put the mark between them;
// any U+200E/U+200F characters matched around them are not copied to the output.
tokenUsfm = RtlVerseRegex.Replace(tokenUsfm, $"$1{directionMarker}$2");
}
break;
case UsfmTokenType.Text:
// Ensure spaces are preserved
tokenUsfm = token.ToUsfm();
if (tokensHaveWhitespace && usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
{
// The output already ends with a space. Either drop that space (the text brings its own
// leading space after a non-blank token, or the text starts with CR/LF), or strip the
// leading spaces from the text instead.
if (
(
tokenUsfm.Length > 0
&& tokenUsfm[0] == ' '
&& prevToken != null
&& prevToken.ToUsfm().Trim() != ""
) || tokenUsfm.StartsWith("\r\n")
)
{
usfm.Length--;
}
else
{
tokenUsfm = tokenUsfm.TrimStart(' ');
}
}
break;
default:
// All other token types are appended exactly as they serialize.
tokenUsfm = token.ToUsfm();
break;
}

usfm.Append(tokenUsfm);
prevToken = token;
}

// Make sure begins without space or CR/LF
if (usfm.Length > 0 && usfm[0] == ' ')
usfm.Remove(0, 1);
// NOTE(review): removes two characters on the assumption that '\r' is followed by '\n';
// the length is only checked to be greater than zero - confirm a lone "\r" cannot occur.
if (usfm.Length > 0 && usfm[0] == '\r')
usfm.Remove(0, 2);

// Make sure ends without space and with a CR/LF
if (usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
usfm.Length--;
if (usfm.Length > 0 && usfm[usfm.Length - 1] != '\n')
usfm.Append("\r\n");
// Drop a single space that sits directly in front of the final CR/LF.
if (
usfm.Length > 3
&& usfm[usfm.Length - 3] == ' '
&& usfm[usfm.Length - 2] == '\r'
&& usfm[usfm.Length - 1] == '\n'
)
usfm.Remove(usfm.Length - 3, 1);
return usfm.ToString();
}

/// <summary>
/// Gets the next word in the usfm and advances the index past it
/// </summary>
Expand Down Expand Up @@ -361,7 +485,7 @@ ref string text
if (matchingToken == null)
return null;

UsfmTag matchingTag = _stylesheet.GetTag(matchingToken.NestlessMarker);
UsfmTag matchingTag = Stylesheet.GetTag(matchingToken.NestlessMarker);
if (
matchingTag.StyleType != UsfmStyleType.Character
&& matchingTag.StyleType != UsfmStyleType.Milestone
Expand Down
4 changes: 1 addition & 3 deletions tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System;
using System.IO;
using System.IO.Compression;
using System.IO.Compression;

namespace SIL.Machine.Corpora
{
Expand Down
1 change: 1 addition & 0 deletions tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.SFM eol=crlf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
\li2 verse four,
\v 5 Chapter one,
\li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five.
\c 2
\c 2
\s1 Chapter Two
\p
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
Expand All @@ -25,7 +25,7 @@
\v 6 Chapter two, verse \w six|strong="12345" \w*.
\v 6 Bad verse.
\v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
\v 7a Chapter two, verse seven A,
\v 7a Chapter two, verse seven A,
\s Section header
\p
\v 7b verse seven B.
Expand Down
3 changes: 1 addition & 2 deletions tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.Linq;
using System.Text;
using System.Text;
using NUnit.Framework;
using SIL.Scripture;

Expand Down
47 changes: 47 additions & 0 deletions tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using NUnit.Framework;

namespace SIL.Machine.Corpora
{
[TestFixture]
public class UsfmTokenizerTests
{
    /// <summary>
    /// Tokenizes the Matthew test file and spot-checks the token count and a few known tokens.
    /// </summary>
    [Test]
    public void Tokenize()
    {
        string source = ReadUsfm();
        var usfmTokenizer = new UsfmTokenizer();
        IReadOnlyList<UsfmToken> tokens = usfmTokenizer.Tokenize(source);
        Assert.That(tokens, Has.Count.EqualTo(136));

        // Book identification token at the very start of the file
        UsfmToken bookToken = tokens[0];
        Assert.That(bookToken.Type, Is.EqualTo(UsfmTokenType.Book));
        Assert.That(bookToken.Marker, Is.EqualTo("id"));
        Assert.That(bookToken.Data, Is.EqualTo("MAT"));

        // Text token
        UsfmToken textToken = tokens[10];
        Assert.That(textToken.Type, Is.EqualTo(UsfmTokenType.Text));
        Assert.That(textToken.Text, Is.EqualTo("Chapter One "));

        // Verse token
        UsfmToken verseToken = tokens[11];
        Assert.That(verseToken.Type, Is.EqualTo(UsfmTokenType.Verse));
        Assert.That(verseToken.Marker, Is.EqualTo("v"));
        Assert.That(verseToken.Data, Is.EqualTo("1"));

        // Footnote opening token
        UsfmToken noteToken = tokens[20];
        Assert.That(noteToken.Type, Is.EqualTo(UsfmTokenType.Note));
        Assert.That(noteToken.Marker, Is.EqualTo("f"));
        Assert.That(noteToken.Data, Is.EqualTo("+"));
    }

    /// <summary>
    /// Round-trips the Matthew test file: tokenizing and then detokenizing must give back the original text.
    /// </summary>
    [Test]
    public void Detokenize()
    {
        string source = ReadUsfm();
        var usfmTokenizer = new UsfmTokenizer();
        IReadOnlyList<UsfmToken> tokens = usfmTokenizer.Tokenize(source);
        string roundTripped = usfmTokenizer.Detokenize(tokens);
        Assert.That(roundTripped, Is.EqualTo(source));
    }

    /// <summary>
    /// Reads the whole USFM test file for Matthew from the test project directory.
    /// </summary>
    private static string ReadUsfm()
    {
        string path = Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM");
        return File.ReadAllText(path);
    }
}
}

0 comments on commit 6451c71

Please sign in to comment.