From 6451c711ebd95448ff0da7d23709324b999519f4 Mon Sep 17 00:00:00 2001
From: Damien Daspit <damien_daspit@sil.org>
Date: Fri, 5 Jan 2024 14:29:15 -0500
Subject: [PATCH] Add support for detokenizing USFM (#154)

- closes #153
---
 src/SIL.Machine/Corpora/UsfmParser.cs         | 185 +++++++++++-------
 src/SIL.Machine/Corpora/UsfmTextBase.cs       |   2 +-
 src/SIL.Machine/Corpora/UsfmToken.cs          |   8 +-
 src/SIL.Machine/Corpora/UsfmTokenizer.cs      | 136 ++++++++++++-
 .../Corpora/CorporaTestHelpers.cs             |   4 +-
 .../Corpora/TestData/.gitattributes           |   1 +
 .../Corpora/TestData/usfm/Tes/41MATTes.SFM    |   4 +-
 .../Corpora/UsfmFileTextTests.cs              |   3 +-
 .../Corpora/UsfmTokenizerTests.cs             |  47 +++++
 9 files changed, 305 insertions(+), 85 deletions(-)
 create mode 100644 tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes
 create mode 100644 tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs

diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs
index 0e632bf96..32a463fe0 100644
--- a/src/SIL.Machine/Corpora/UsfmParser.cs
+++ b/src/SIL.Machine/Corpora/UsfmParser.cs
@@ -16,21 +16,35 @@ namespace SIL.Machine.Corpora
     public class UsfmParser
     {
         public static void Parse(
-            UsfmStylesheet stylesheet,
             string usfm,
             IUsfmParserHandler handler,
+            string stylesheetFileName = "usfm.sty",
             ScrVers versification = null,
             bool preserveWhitespace = false
         )
         {
-            var parser = new UsfmParser(stylesheet, usfm, handler, versification, preserveWhitespace);
+            Parse(usfm, handler, new UsfmStylesheet(stylesheetFileName), versification, preserveWhitespace);
+        }
+
+        public static void Parse(
+            string usfm,
+            IUsfmParserHandler handler,
+            UsfmStylesheet stylesheet = null,
+            ScrVers versification = null,
+            bool preserveWhitespace = false
+        )
+        {
+            var parser = new UsfmParser(
+                usfm,
+                handler,
+                stylesheet ?? new UsfmStylesheet("usfm.sty"),
+                versification,
+                preserveWhitespace
+            );
             parser.ProcessTokens();
         }
 
         private static readonly Regex OptBreakSplitter = new Regex("(//)", RegexOptions.Compiled);
-        private readonly bool _tokensPreserveWhitespace;
-
-        private readonly IUsfmParserHandler _handler;
 
         /// <summary>
         /// Number of tokens to skip over because have been processed in advance
@@ -39,33 +53,64 @@ public static void Parse(
         private int _skip = 0;
 
         public UsfmParser(
-            UsfmStylesheet stylesheet,
             IReadOnlyList<UsfmToken> tokens,
             IUsfmParserHandler handler = null,
+            string stylesheetFileName = "usfm.sty",
             ScrVers versification = null,
             bool tokensPreserveWhitespace = false
         )
-        {
-            State = new UsfmParserState(stylesheet, versification ?? ScrVers.English, tokens);
-            _handler = handler;
-            _tokensPreserveWhitespace = tokensPreserveWhitespace;
-        }
+            : this(tokens, handler, new UsfmStylesheet(stylesheetFileName), versification, tokensPreserveWhitespace) { }
+
+        public UsfmParser(
+            IReadOnlyList<UsfmToken> tokens,
+            IUsfmParserHandler handler = null,
+            UsfmStylesheet stylesheet = null,
+            ScrVers versification = null,
+            bool tokensPreserveWhitespace = false
+        )
+            : this(
+                new UsfmParserState(
+                    stylesheet ?? new UsfmStylesheet("usfm.sty"),
+                    versification ?? ScrVers.English,
+                    tokens
+                ),
+                handler,
+                tokensPreserveWhitespace
+            ) { }
 
         public UsfmParser(
-            UsfmStylesheet stylesheet,
             string usfm,
             IUsfmParserHandler handler = null,
+            string stylesheetFileName = "usfm.sty",
             ScrVers versification = null,
-            bool preserveWhitespace = false
+            bool tokensPreserveWhitespace = false
+        )
+            : this(usfm, handler, new UsfmStylesheet(stylesheetFileName), versification, tokensPreserveWhitespace) { }
+
+        public UsfmParser(
+            string usfm,
+            IUsfmParserHandler handler = null,
+            UsfmStylesheet stylesheet = null,
+            ScrVers versification = null,
+            bool tokensPreserveWhitespace = false
         )
             : this(
-                stylesheet,
-                GetTokens(stylesheet, usfm, preserveWhitespace),
+                new UsfmParserState(
+                    stylesheet ?? new UsfmStylesheet("usfm.sty"),
+                    versification ?? ScrVers.English,
+                    GetTokens(stylesheet, usfm, tokensPreserveWhitespace)
+                ),
                 handler,
-                versification,
-                preserveWhitespace
+                tokensPreserveWhitespace
             ) { }
 
+        private UsfmParser(UsfmParserState state, IUsfmParserHandler handler, bool tokensPreserveWhitespace)
+        {
+            State = state;
+            Handler = handler;
+            TokensPreserveWhitespace = tokensPreserveWhitespace;
+        }
+
         private static IReadOnlyList<UsfmToken> GetTokens(
             UsfmStylesheet stylesheet,
             string usfm,
@@ -76,6 +121,10 @@ bool preserveWhitespace
             return tokenizer.Tokenize(usfm, preserveWhitespace);
         }
 
+        public IUsfmParserHandler Handler { get; }
+
+        public bool TokensPreserveWhitespace { get; }
+
         /// <summary>
         /// Gets the current parser state. Note: Will change with each token parsed
         /// </summary>
@@ -98,12 +147,12 @@ public bool ProcessToken()
             // If past end
             if (State.Index >= State.Tokens.Count - 1)
             {
-                _handler?.EndUsfm(State);
+                Handler?.EndUsfm(State);
                 return false;
             }
             else if (State.Index < 0)
             {
-                _handler?.StartUsfm(State);
+                Handler?.StartUsfm(State);
             }
 
             // Move to next token
@@ -111,7 +160,7 @@ public bool ProcessToken()
 
             // Update verse offset with previous token (since verse offset is from start of current token)
             if (State.PrevToken != null)
-                State.VerseOffset += State.PrevToken.GetLength(addSpaces: !_tokensPreserveWhitespace);
+                State.VerseOffset += State.PrevToken.GetLength(addSpaces: !TokensPreserveWhitespace);
 
             // Skip over tokens that are to be skipped, ensuring that
             // SpecialToken state is true.
@@ -132,8 +181,8 @@ public bool ProcessToken()
             if (tokenType == UsfmTokenType.Unknown)
                 tokenType = DetermineUnknownTokenType();
 
-            if (_handler != null && !string.IsNullOrEmpty(token.Marker))
-                _handler.GotMarker(State, token.Marker);
+            if (Handler != null && !string.IsNullOrEmpty(token.Marker))
+                Handler.GotMarker(State, token.Marker);
 
             // Close open elements
             switch (tokenType)
@@ -237,8 +286,8 @@ public bool ProcessToken()
 
                     // Unmatched end marker
                     if (unmatched)
-                        if (_handler != null)
-                            _handler.Unmatched(State, token.Marker);
+                        if (Handler != null)
+                            Handler.Unmatched(State, token.Marker);
                     break;
             }
 
@@ -263,8 +312,8 @@ public bool ProcessToken()
                     State.VerseOffset = 0;
 
                     // Book start.
-                    if (_handler != null)
-                        _handler.StartBook(State, token.Marker, code);
+                    if (Handler != null)
+                        Handler.StartBook(State, token.Marker, code);
                     break;
                 case UsfmTokenType.Chapter:
                     // Get alternate chapter number
@@ -309,8 +358,8 @@ public bool ProcessToken()
                     if (State.VerseRef.ChapterNum != 1)
                         State.VerseOffset = 0;
 
-                    if (_handler != null)
-                        _handler.Chapter(State, token.Data, token.Marker, altChapter, pubChapter);
+                    if (Handler != null)
+                        Handler.Chapter(State, token.Data, token.Marker, altChapter, pubChapter);
                     break;
                 case UsfmTokenType.Verse:
                     string pubVerse = null;
@@ -344,8 +393,8 @@ public bool ProcessToken()
                     State.VerseRef = vref;
                     State.VerseOffset = 0;
 
-                    if (_handler != null)
-                        _handler.Verse(State, token.Data, token.Marker, altVerse, pubVerse);
+                    if (Handler != null)
+                        Handler.Verse(State, token.Data, token.Marker, altVerse, pubVerse);
                     break;
                 case UsfmTokenType.Paragraph:
                     // Handle special case of table rows
@@ -355,15 +404,15 @@ public bool ProcessToken()
                         if (State.Stack.All(e => e.Type != UsfmElementType.Table))
                         {
                             State.Push(new UsfmParserElement(UsfmElementType.Table, null));
-                            if (_handler != null)
-                                _handler.StartTable(State);
+                            if (Handler != null)
+                                Handler.StartTable(State);
                         }
 
                         State.Push(new UsfmParserElement(UsfmElementType.Row, token.Marker));
 
                         // Row start
-                        if (_handler != null)
-                            _handler.StartRow(State, token.Marker);
+                        if (Handler != null)
+                            Handler.StartRow(State, token.Marker);
                         break;
                     }
 
@@ -386,8 +435,8 @@ public bool ProcessToken()
                             _skip += 3;
                         }
 
-                        if (_handler != null)
-                            _handler.StartSidebar(State, token.Marker, sidebarCategory);
+                        if (Handler != null)
+                            Handler.StartSidebar(State, token.Marker, sidebarCategory);
                         break;
                     }
 
@@ -399,9 +448,9 @@ public bool ProcessToken()
                             while (State.Stack.Count > 0)
                                 CloseElement(State.Peek().Type == UsfmElementType.Sidebar);
                         }
-                        else if (_handler != null)
+                        else if (Handler != null)
                         {
-                            _handler.Unmatched(State, token.Marker);
+                            Handler.Unmatched(State, token.Marker);
                         }
                         break;
                     }
@@ -409,8 +458,8 @@ public bool ProcessToken()
                     State.Push(new UsfmParserElement(UsfmElementType.Para, token.Marker));
 
                     // Paragraph opening
-                    if (_handler != null)
-                        _handler.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes);
+                    if (Handler != null)
+                        Handler.StartPara(State, token.Marker, token.Type == UsfmTokenType.Unknown, token.Attributes);
                     break;
                 case UsfmTokenType.Character:
                     // Handle special case of table cells (treated as special character style)
@@ -425,8 +474,8 @@ public bool ProcessToken()
                         UsfmStylesheet.IsCellRange(token.Marker, out string baseMarker, out int colspan);
                         State.Push(new UsfmParserElement(UsfmElementType.Cell, baseMarker));
 
-                        if (_handler != null)
-                            _handler.StartCell(State, baseMarker, align, colspan);
+                        if (Handler != null)
+                            Handler.StartCell(State, baseMarker, align, colspan);
                         break;
                     }
 
@@ -439,8 +488,8 @@ public bool ProcessToken()
 
                         _skip += 2;
 
-                        if (_handler != null)
-                            _handler.Ref(State, token.Marker, display, target);
+                        if (Handler != null)
+                            Handler.Ref(State, token.Marker, display, target);
                         break;
                     }
 
@@ -457,9 +506,9 @@ public bool ProcessToken()
                         actualMarker = token.Marker;
 
                     State.Push(new UsfmParserElement(UsfmElementType.Char, actualMarker, token.Attributes));
-                    if (_handler != null)
+                    if (Handler != null)
                     {
-                        _handler.StartChar(
+                        Handler.StartChar(
                             State,
                             actualMarker,
                             token.Type == UsfmTokenType.Unknown || invalidMarker,
@@ -484,8 +533,8 @@ public bool ProcessToken()
 
                     State.Push(new UsfmParserElement(UsfmElementType.Note, token.Marker));
 
-                    if (_handler != null)
-                        _handler.StartNote(State, token.Marker, token.Data, noteCategory);
+                    if (Handler != null)
+                        Handler.StartNote(State, token.Marker, token.Data, noteCategory);
                     break;
                 case UsfmTokenType.Text:
                     string text = token.Text;
@@ -507,7 +556,7 @@ public bool ProcessToken()
                         text = text.Substring(0, text.Length - 1);
                     }
 
-                    if (_handler != null)
+                    if (Handler != null)
                     {
                         // Replace ~ with nbsp
                         text = text.Replace('~', '\u00A0');
@@ -516,9 +565,9 @@ public bool ProcessToken()
                         foreach (string str in OptBreakSplitter.Split(text))
                         {
                             if (str == "//")
-                                _handler.OptBreak(State);
+                                Handler.OptBreak(State);
                             else
-                                _handler.Text(State, str);
+                                Handler.Text(State, str);
                         }
                     }
                     break;
@@ -526,7 +575,7 @@ public bool ProcessToken()
                 case UsfmTokenType.Milestone:
                 case UsfmTokenType.MilestoneEnd:
                     // currently, parse state doesn't need to be update, so just inform the handler about the milestone.
-                    _handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes);
+                    Handler?.Milestone(State, token.Marker, token.Type == UsfmTokenType.Milestone, token.Attributes);
                     break;
             }
 
@@ -589,36 +638,36 @@ private void CloseElement(bool closed = false)
             switch (element.Type)
             {
                 case UsfmElementType.Book:
-                    if (_handler != null)
-                        _handler.EndBook(State, element.Marker);
+                    if (Handler != null)
+                        Handler.EndBook(State, element.Marker);
                     break;
                 case UsfmElementType.Para:
-                    if (_handler != null)
-                        _handler.EndPara(State, element.Marker);
+                    if (Handler != null)
+                        Handler.EndPara(State, element.Marker);
                     break;
                 case UsfmElementType.Char:
-                    if (_handler != null)
-                        _handler.EndChar(State, element.Marker, element.Attributes, closed);
+                    if (Handler != null)
+                        Handler.EndChar(State, element.Marker, element.Attributes, closed);
                     break;
                 case UsfmElementType.Note:
-                    if (_handler != null)
-                        _handler.EndNote(State, element.Marker, closed);
+                    if (Handler != null)
+                        Handler.EndNote(State, element.Marker, closed);
                     break;
                 case UsfmElementType.Table:
-                    if (_handler != null)
-                        _handler.EndTable(State);
+                    if (Handler != null)
+                        Handler.EndTable(State);
                     break;
                 case UsfmElementType.Row:
-                    if (_handler != null)
-                        _handler.EndRow(State, element.Marker);
+                    if (Handler != null)
+                        Handler.EndRow(State, element.Marker);
                     break;
                 case UsfmElementType.Cell:
-                    if (_handler != null)
-                        _handler.EndCell(State, element.Marker);
+                    if (Handler != null)
+                        Handler.EndCell(State, element.Marker);
                     break;
                 case UsfmElementType.Sidebar:
-                    if (_handler != null)
-                        _handler.EndSidebar(State, element.Marker, closed);
+                    if (Handler != null)
+                        Handler.EndSidebar(State, element.Marker, closed);
                     break;
             }
         }
diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs
index af75f639f..3180a3eb2 100644
--- a/src/SIL.Machine/Corpora/UsfmTextBase.cs
+++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs
@@ -30,7 +30,7 @@ protected override IEnumerable<TextRow> GetVersesInDocOrder()
         {
             string usfm = ReadUsfm();
             var rowCollector = new TextRowCollector(this);
-            UsfmParser.Parse(_stylesheet, usfm, rowCollector, Versification, preserveWhitespace: _includeMarkers);
+            UsfmParser.Parse(usfm, rowCollector, _stylesheet, Versification, preserveWhitespace: _includeMarkers);
             return rowCollector.Rows;
         }
 
diff --git a/src/SIL.Machine/Corpora/UsfmToken.cs b/src/SIL.Machine/Corpora/UsfmToken.cs
index 765b3fb50..46ec1bd9c 100644
--- a/src/SIL.Machine/Corpora/UsfmToken.cs
+++ b/src/SIL.Machine/Corpora/UsfmToken.cs
@@ -172,7 +172,7 @@ public int GetLength(bool includeNewlines = false, bool addSpaces = true)
 
                 if (!string.IsNullOrEmpty(Data))
                 {
-                    if (Marker.Length > 0)
+                    if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*'))
                         totalLength++;
                     totalLength += Data.Length;
                     if (addSpaces)
@@ -225,7 +225,7 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true)
 
                 if (!string.IsNullOrEmpty(Data))
                 {
-                    if (Marker.Length > 0)
+                    if (!addSpaces && (Marker.Length == 0 || Marker[Marker.Length - 1] != '*'))
                         sb.Append(' ');
                     sb.Append(Data);
                     if (addSpaces)
@@ -236,11 +236,13 @@ public string ToUsfm(bool includeNewlines = false, bool addSpaces = true)
                 {
                     string attributes = ToAttributeString();
                     if (attributes != "")
+                    {
                         sb.Append(attributes);
+                    }
                     else
                     {
                         // remove space that was put after marker - not needed when there are no attributes.
-                        sb.Length -= 1;
+                        sb.Length--;
                     }
                     sb.Append(@"\*");
                 }
diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs
index e44dd91e3..564c4b4e9 100644
--- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs
+++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs
@@ -2,20 +2,41 @@
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
+using System.Text.RegularExpressions;
 
 namespace SIL.Machine.Corpora
 {
+    public enum RtlReferenceOrder // selects the direction mark UsfmTokenizer.Detokenize puts into verse tokens
+    {
+        NotSet, // Detokenize leaves verse tokens unchanged
+        BookChapterVerse, // Detokenize inserts U+200F (right-to-left mark)
+        BookVerseChapter // Detokenize inserts U+200E (left-to-right mark)
+    }
+
     public class UsfmTokenizer
     {
         private const char ZeroWidthSpace = '\u200B';
 
-        private readonly UsfmStylesheet _stylesheet;
+        private static readonly Regex RtlVerseRegex = new Regex(
+            @"[\u200E\u200F]*(\d+\w?)[\u200E\u200F]*([\p{P}\p{S}])[\u200E\u200F]*(?=\d)",
+            RegexOptions.Compiled
+        );
 
-        public UsfmTokenizer(UsfmStylesheet stylesheet)
+        public UsfmTokenizer(
+            string stylesheetFileName = "usfm.sty",
+            RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet
+        )
+            : this(new UsfmStylesheet(stylesheetFileName), rtlReferenceOrder) { }
+
+        public UsfmTokenizer(UsfmStylesheet stylesheet, RtlReferenceOrder rtlReferenceOrder = RtlReferenceOrder.NotSet)
         {
-            _stylesheet = stylesheet;
+            Stylesheet = stylesheet ?? new UsfmStylesheet("usfm.sty");
+            RtlReferenceOrder = rtlReferenceOrder;
         }
 
+        public UsfmStylesheet Stylesheet { get; }
+        public RtlReferenceOrder RtlReferenceOrder { get; }
+
         public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace = false)
         {
             List<UsfmToken> tokens = new List<UsfmToken>();
@@ -112,7 +133,7 @@ ref text
                 }
 
                 // Lookup marker
-                UsfmTag tag = _stylesheet.GetTag(marker.TrimStart('+'));
+                UsfmTag tag = Stylesheet.GetTag(marker.TrimStart('+'));
 
                 // If starts with a plus and is not a character style or an end style, it is an unknown tag
                 if (
@@ -121,7 +142,7 @@ ref text
                     && tag.StyleType != UsfmStyleType.End
                 )
                 {
-                    tag = _stylesheet.GetTag(marker);
+                    tag = Stylesheet.GetTag(marker);
                 }
 
                 string endMarker = tag.StyleType != UsfmStyleType.Milestone ? marker + "*" : tag.EndMarker;
@@ -276,6 +297,109 @@ ref text
             return tokens;
         }
 
+        /// <summary>
+        /// Converts tokens back into a USFM string.
+        /// </summary>
+        /// <param name="tokens">The tokens to serialize, in document order.</param>
+        /// <param name="tokensHaveWhitespace">
+        /// When <c>false</c>, a CR/LF is inserted before book, chapter, paragraph and verse tokens. When
+        /// <c>true</c>, no line breaks are inserted and verse tokens are trimmed instead.
+        /// </param>
+        /// <returns>The USFM text; when non-empty it ends with a newline.</returns>
+        public string Detokenize(IEnumerable<UsfmToken> tokens, bool tokensHaveWhitespace = false)
+        {
+            UsfmToken prevToken = null;
+            var usfm = new StringBuilder();
+            foreach (UsfmToken token in tokens)
+            {
+                string tokenUsfm = token.ToUsfm();
+                switch (token.Type)
+                {
+                    case UsfmTokenType.Book:
+                    case UsfmTokenType.Chapter:
+                    case UsfmTokenType.Paragraph:
+                        BreakLineBeforeMarker(usfm, prevToken, tokensHaveWhitespace);
+                        break;
+                    case UsfmTokenType.Verse:
+                        // Add newline if after anything other than [ or (
+                        if (usfm.Length > 0 && usfm[usfm.Length - 1] != '[' && usfm[usfm.Length - 1] != '(')
+                            BreakLineBeforeMarker(usfm, prevToken, tokensHaveWhitespace);
+                        if (tokensHaveWhitespace)
+                            tokenUsfm = tokenUsfm.Trim();
+                        if (RtlReferenceOrder != RtlReferenceOrder.NotSet)
+                        {
+                            // Insert LRM (verse-chapter order) or RLM between verse number and punctuation
+                            string directionMarker =
+                                RtlReferenceOrder == RtlReferenceOrder.BookVerseChapter ? "\u200e" : "\u200f";
+                            tokenUsfm = RtlVerseRegex.Replace(tokenUsfm, $"$1{directionMarker}$2");
+                        }
+                        break;
+                    case UsfmTokenType.Text:
+                        // Ensure spaces are preserved
+                        if (tokensHaveWhitespace && usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
+                        {
+                            if (
+                                (
+                                    tokenUsfm.Length > 0
+                                    && tokenUsfm[0] == ' '
+                                    && prevToken != null
+                                    && prevToken.ToUsfm().Trim() != ""
+                                ) || tokenUsfm.StartsWith("\r\n", System.StringComparison.Ordinal)
+                            )
+                            {
+                                usfm.Length--;
+                            }
+                            else
+                            {
+                                tokenUsfm = tokenUsfm.TrimStart(' ');
+                            }
+                        }
+                        break;
+                }
+
+                usfm.Append(tokenUsfm);
+                prevToken = token;
+            }
+
+            // Make sure begins without space or CR/LF (a lone CR is removed on its own)
+            if (usfm.Length > 0 && usfm[0] == ' ')
+                usfm.Remove(0, 1);
+            if (usfm.Length > 0 && usfm[0] == '\r')
+                usfm.Remove(0, usfm.Length > 1 && usfm[1] == '\n' ? 2 : 1);
+
+            // Make sure ends without space and with a CR/LF
+            if (usfm.Length > 0 && usfm[usfm.Length - 1] == ' ')
+                usfm.Length--;
+            if (usfm.Length > 0 && usfm[usfm.Length - 1] != '\n')
+                usfm.Append("\r\n");
+            if (
+                usfm.Length > 3
+                && usfm[usfm.Length - 3] == ' '
+                && usfm[usfm.Length - 2] == '\r'
+                && usfm[usfm.Length - 1] == '\n'
+            )
+                usfm.Remove(usfm.Length - 3, 1);
+            return usfm.ToString();
+        }
+
+        /// <summary>
+        /// Gets the buffer ready for a token that starts a new line: strips a single trailing space and,
+        /// unless the tokens carry their own whitespace, appends a CR/LF. Does nothing on an empty buffer.
+        /// </summary>
+        private static void BreakLineBeforeMarker(StringBuilder usfm, UsfmToken prevToken, bool tokensHaveWhitespace)
+        {
+            if (usfm.Length == 0)
+                return;
+
+            // Only ever strip a space, never any other character. When the tokens carry their own
+            // whitespace, the space is kept if the previous token was blank.
+            bool endsWithSpace = usfm[usfm.Length - 1] == ' ';
+            if (endsWithSpace && (!tokensHaveWhitespace || (prevToken != null && prevToken.ToUsfm().Trim() != "")))
+                usfm.Length--;
+            if (!tokensHaveWhitespace)
+                usfm.Append("\r\n");
+        }
+
         /// <summary>
         /// Gets the next word in the usfm and advances the index past it
         /// </summary>
@@ -361,7 +485,7 @@ ref string text
             if (matchingToken == null)
                 return null;
 
-            UsfmTag matchingTag = _stylesheet.GetTag(matchingToken.NestlessMarker);
+            UsfmTag matchingTag = Stylesheet.GetTag(matchingToken.NestlessMarker);
             if (
                 matchingTag.StyleType != UsfmStyleType.Character
                 && matchingTag.StyleType != UsfmStyleType.Milestone
diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs b/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs
index 9c19308e7..26deaf022 100644
--- a/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs
+++ b/tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs
@@ -1,6 +1,4 @@
-using System;
-using System.IO;
-using System.IO.Compression;
+using System.IO.Compression;
 
 namespace SIL.Machine.Corpora
 {
diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes b/tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes
new file mode 100644
index 000000000..e29803af9
--- /dev/null
+++ b/tests/SIL.Machine.Tests/Corpora/TestData/.gitattributes
@@ -0,0 +1 @@
+*.SFM eol=crlf
diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM
index 83a1f6792..8cb81d7a4 100644
--- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM
+++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/41MATTes.SFM
@@ -14,7 +14,7 @@
 \li2 verse four,
 \v 5 Chapter one,
 \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five.
-\c 2 
+\c 2
 \s1 Chapter Two
 \p
 \v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
@@ -25,7 +25,7 @@
 \v 6 Chapter two, verse \w six|strong="12345" \w*.
 \v 6 Bad verse.
 \v 5 Chapter two, verse five \rq (MAT 3:1)\rq*.
-\v 7a Chapter two, verse seven A, 
+\v 7a Chapter two, verse seven A,
 \s Section header
 \p
 \v 7b verse seven B.
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
index 26fe4eba7..c7a162b51 100644
--- a/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmFileTextTests.cs
@@ -1,5 +1,4 @@
-using System.Linq;
-using System.Text;
+using System.Text;
 using NUnit.Framework;
 using SIL.Scripture;
 
diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
new file mode 100644
index 000000000..ff6aaf30a
--- /dev/null
+++ b/tests/SIL.Machine.Tests/Corpora/UsfmTokenizerTests.cs
@@ -0,0 +1,47 @@
+using NUnit.Framework;
+
+namespace SIL.Machine.Corpora
+{
+    [TestFixture]
+    public class UsfmTokenizerTests
+    {
+        [Test]
+        public void Tokenize()
+        {
+            // Tokenize the test book using the tokenizer's default constructor arguments.
+            IReadOnlyList<UsfmToken> tokens = new UsfmTokenizer().Tokenize(ReadUsfm());
+            Assert.That(tokens, Has.Count.EqualTo(136));
+
+            UsfmToken bookToken = tokens[0];
+            Assert.That(bookToken.Type, Is.EqualTo(UsfmTokenType.Book));
+            Assert.That(bookToken.Marker, Is.EqualTo("id"));
+            Assert.That(bookToken.Data, Is.EqualTo("MAT"));
+
+            UsfmToken textToken = tokens[10];
+            Assert.That(textToken.Type, Is.EqualTo(UsfmTokenType.Text));
+            Assert.That(textToken.Text, Is.EqualTo("Chapter One "));
+
+            UsfmToken verseToken = tokens[11];
+            Assert.That(verseToken.Type, Is.EqualTo(UsfmTokenType.Verse));
+            Assert.That(verseToken.Marker, Is.EqualTo("v"));
+            Assert.That(verseToken.Data, Is.EqualTo("1"));
+
+            UsfmToken noteToken = tokens[20];
+            Assert.That(noteToken.Type, Is.EqualTo(UsfmTokenType.Note));
+            Assert.That(noteToken.Marker, Is.EqualTo("f"));
+            Assert.That(noteToken.Data, Is.EqualTo("+"));
+        }
+
+        [Test]
+        public void Detokenize()
+        {
+            // Round trip: detokenizing the tokens of the test book must reproduce the file text exactly.
+            string original = ReadUsfm();
+            var tokenizer = new UsfmTokenizer();
+            Assert.That(tokenizer.Detokenize(tokenizer.Tokenize(original)), Is.EqualTo(original));
+        }
+
+        private static string ReadUsfm() =>
+            File.ReadAllText(Path.Combine(CorporaTestHelpers.UsfmTestProjectPath, "41MATTes.SFM"));
+    }
+}