From 6766487ca8692297168dba436ebb6c8bd66fc25e Mon Sep 17 00:00:00 2001
From: Damien Daspit <damien_daspit@sil.org>
Date: Mon, 15 Jan 2018 16:45:24 +0700
Subject: [PATCH] segment and align USX texts

---
 .../translate/core/machine.service.ts         |   7 +-
 .../translate/editor/document-editor.ts       |  89 +++++-------
 .../translate/editor/editor.component.html    |   4 +-
 .../translate/editor/editor.component.ts      |  16 ++-
 .../languageforge/translate/editor/segment.ts |   2 +-
 .../translate/editor/segmenter.ts             | 135 ++++++++++++++++++
 .../Services/DeltaUsxMapper.cs                |  28 +---
 tsconfig.json                                 |   2 +
 8 files changed, 196 insertions(+), 87 deletions(-)
 create mode 100644 src/angular-app/languageforge/translate/editor/segmenter.ts

diff --git a/src/angular-app/languageforge/translate/core/machine.service.ts b/src/angular-app/languageforge/translate/core/machine.service.ts
index 74f3b0bd0b..9e7f831345 100644
--- a/src/angular-app/languageforge/translate/core/machine.service.ts
+++ b/src/angular-app/languageforge/translate/core/machine.service.ts
@@ -29,12 +29,11 @@ export class MachineService {
     }
   }
 
-  initialise(projectId: string, isScripture: boolean): void {
+  initialise(projectId: string): void {
     this.engine = new TranslationEngine(this.$window.location.origin + '/machine', projectId);
     this.updateConfidence();
-    const segmentType = isScripture ? 'line' : 'latin';
-    this.sourceSegmentTokenizer = new SegmentTokenizer(segmentType);
-    this.targetSegmentTokenizer = new SegmentTokenizer(segmentType);
+    this.sourceSegmentTokenizer = new SegmentTokenizer('latin');
+    this.targetSegmentTokenizer = new SegmentTokenizer('latin');
   }
 
   translate(sourceSegment: string): angular.IPromise<void> {
diff --git a/src/angular-app/languageforge/translate/editor/document-editor.ts b/src/angular-app/languageforge/translate/editor/document-editor.ts
index 78b8ad7f95..17e972c629 100644
--- a/src/angular-app/languageforge/translate/editor/document-editor.ts
+++ b/src/angular-app/languageforge/translate/editor/document-editor.ts
@@ -1,28 +1,27 @@
 import * as angular from 'angular';
 import Quill, { RangeStatic } from 'quill';
 
+import { InputSystem } from '../../../bellows/shared/model/input-system.model';
 import { DocType, SaveState } from '../core/constants';
 import { MachineService } from '../core/machine.service';
 import { RealTimeService } from '../core/realtime.service';
 import { MetricService } from './metric.service';
 import { SuggestionsTheme } from './quill/suggestions-theme';
 import { Segment } from './segment';
+import { MachineSegmenter, Segmenter, UsxSegmenter } from './segmenter';
 
 export abstract class DocumentEditor {
-  static isSelectionCollapsed(selection: RangeStatic): boolean {
-    return selection != null && selection.length === 0;
-  }
-
   static isTextEmpty(text: string): boolean {
     text = text.endsWith('\n') ? text.substr(0, text.length - 1) : text;
     return text === '';
   }
 
   modulesConfig: any = {};
-  inputSystem: any = {};
+  inputSystem: InputSystem = new InputSystem();
 
   protected currentSegment: Segment;
-  protected segmentRanges: RangeStatic[];
+  protected segmenter: Segmenter;
+  private _isScripture: boolean = false;
 
   private documentSetId: string = '';
   private readonly _created: angular.IDeferred<boolean>;
@@ -31,6 +30,7 @@ export abstract class DocumentEditor {
   constructor(private readonly $q: angular.IQService, protected readonly machine: MachineService,
               private readonly realTime: RealTimeService) {
     this._created = this.$q.defer();
+    this.segmenter = new MachineSegmenter(this, machine);
   }
 
   abstract get docType(): string;
@@ -57,8 +57,8 @@ export abstract class DocumentEditor {
     return this.currentSegment == null ? '' : this.currentSegment.documentSetId;
   }
 
-  get currentSegmentIndex(): number {
-    return this.currentSegment == null ? -1 : this.currentSegment.index;
+  get currentSegmentRef(): string {
+    return this.currentSegment == null ? '' : this.currentSegment.ref;
   }
 
   get saveState(): SaveState {
@@ -69,11 +69,22 @@ export abstract class DocumentEditor {
     return DocumentEditor.isTextEmpty(this.quill.getText());
   }
 
+  get isScripture(): boolean { return this._isScripture; }
+
+  /** Selects USX (true) or machine-tokenizer (false) segmentation; re-assigning the current value is a no-op. */
+  set isScripture(value: boolean) {
+    if (value === this._isScripture) {
+      return;
+    }
+    this._isScripture = value;
+    this.segmenter = value ? new UsxSegmenter(this) : new MachineSegmenter(this, this.machine);
+  }
+
   openDocumentSet(collection: string, documentSetId: string): void {
     if (this.documentSetId !== documentSetId) {
       this.documentSetId = documentSetId;
       this.realTime.createAndSubscribeRichTextDoc(collection, this.docId, this.quill);
-      this.segmentRanges = null;
+      this.segmenter.reset();
     }
   }
 
@@ -86,22 +97,18 @@ export abstract class DocumentEditor {
   }
 
   update(textChange: boolean): boolean {
-    if (this.segmentRanges == null || textChange) {
-      this.segmentRanges = this.getSegmentRanges();
-    }
+    this.segmenter.update(textChange);
+
     const selection = this.quill.getSelection();
     if (selection == null) {
       return false;
     }
-    let segmentIndex = -1;
-    if (DocumentEditor.isSelectionCollapsed(selection)) {
-      segmentIndex = this.segmentRanges.findIndex(range => selection.index <= range.index + range.length);
-    }
-    if (segmentIndex === -1) {
-      segmentIndex = this.currentSegment == null ? this.segmentRanges.length - 1 : this.currentSegment.index;
+    let segmentRef = this.segmenter.getSegmentRef(selection);
+    if (segmentRef == null) {
+      segmentRef = this.currentSegment == null ? this.segmenter.lastSegmentRef : this.currentSegment.ref;
     }
 
-    if (this.switchCurrentSegment(segmentIndex)) {
+    if (this.switchCurrentSegment(segmentRef)) {
       // the selection has changed to a different segment
       return true;
     } else {
@@ -111,15 +118,15 @@ export abstract class DocumentEditor {
     }
   }
 
-  switchCurrentSegment(segmentIndex: number): boolean {
+  switchCurrentSegment(segmentRef: string): boolean {
     if (this.currentSegment != null && this.documentSetId === this.currentSegment.documentSetId
-      && segmentIndex === this.currentSegment.index
+      && segmentRef === this.currentSegment.ref
     ) {
       // the selection has not changed to a different segment
       return false;
     }
 
-    this.currentSegment = new Segment(this.documentSetId, segmentIndex);
+    this.currentSegment = new Segment(this.documentSetId, segmentRef);
     this.updateCurrentSegment();
     return true;
   }
@@ -162,32 +169,8 @@ export abstract class DocumentEditor {
     return this.documentSetId + ':' + this.docType;
   }
 
-  private getSegmentRange(index: number): RangeStatic {
-    if (this.isTextEmpty) {
-      return { index: 0, length: 0 };
-    }
-
-    const segments = this.getSegmentRanges();
-    return index < segments.length ? segments[index] : { index: this.quill.getLength() - 1, length: 0 };
-  }
-
-  private getSegmentRanges(): RangeStatic[] {
-    const text = this.quill.getText().substr(0, this.quill.getLength() - 1);
-    const segmentRanges = this.machine.tokenizeDocumentText(this.docType, text);
-    if (segmentRanges.length === 0) {
-      segmentRanges.push({ index: 0, length: 0 });
-    } else {
-      const lastSegmentRange = segmentRanges[segmentRanges.length - 1];
-      const lastSegmentEnd = lastSegmentRange.index + lastSegmentRange.length;
-      if (lastSegmentEnd < text.length) {
-        segmentRanges.push({ index: text.length, length: 0 });
-      }
-    }
-    return segmentRanges;
-  }
-
   private updateCurrentSegment() {
-    const range = this.getSegmentRange(this.currentSegment.index);
+    const range = this.segmenter.getSegmentRange(this.currentSegment.ref);
     const text = this.quill.getText(range.index, range.length);
     this.currentSegment.update(text, range);
   }
@@ -247,16 +230,16 @@ export class TargetDocumentEditor extends DocumentEditor {
       this.updateSuggestions();
     }
 
-    if (textChange && this.currentSegment != null && this.currentSegment.index === this.segmentRanges.length - 1) {
+    if (textChange && this.currentSegment != null && this.currentSegment.ref === this.segmenter.lastSegmentRef) {
       this.updateHighlight(this.currentSegment.range);
     }
 
     return segmentChanged;
   }
 
-  switchCurrentSegment(segmentIndex: number): boolean {
+  switchCurrentSegment(segmentRef: string): boolean {
     const previousSegment = this.currentSegment;
-    const segmentChanged = super.switchCurrentSegment(segmentIndex);
+    const segmentChanged = super.switchCurrentSegment(segmentRef);
     if (segmentChanged) {
       this.trainSegment(previousSegment);
     }
@@ -358,7 +341,7 @@ export class TargetDocumentEditor extends DocumentEditor {
     return this.machine.trainSegment()
       .then(() => {
         segment.acceptChanges();
-        this.$window.console.log('Segment ' + segment.index + ' of document ' + segment.documentSetId
+        this.$window.console.log('Segment ' + segment.ref + ' of document ' + segment.documentSetId
           + ' was trained successfully.');
       })
       .finally(() => this.pendingTrainCount--);
@@ -428,11 +411,11 @@ export class SourceDocumentEditor extends DocumentEditor {
     return segmentChanged;
   }
 
-  switchCurrentSegment(segmentIndex: number): boolean {
+  switchCurrentSegment(segmentRef: string): boolean {
     if (!this.hasFocus) {
       this.isCurrentSegmentHighlighted = false;
     }
-    const segmentChanged = super.switchCurrentSegment(segmentIndex);
+    const segmentChanged = super.switchCurrentSegment(segmentRef);
     if (!this.hasFocus) {
       this.isCurrentSegmentHighlighted = true;
     }
diff --git a/src/angular-app/languageforge/translate/editor/editor.component.html b/src/angular-app/languageforge/translate/editor/editor.component.html
index eb90d81ede..79b9994f6e 100644
--- a/src/angular-app/languageforge/translate/editor/editor.component.html
+++ b/src/angular-app/languageforge/translate/editor/editor.component.html
@@ -136,7 +136,7 @@
                         <i class="fa fa-exchange"></i></button>
                 </div>
                 <ng-quill-editor class="notranslate"
-                    id="editorSource" data-modules="$ctrl.source.modulesConfig" data-theme="suggestions"
+                    id="editorSource" spellcheck="false" data-modules="$ctrl.source.modulesConfig" data-theme="suggestions"
                     data-on-editor-created="$ctrl.onQuillCreated(editor, $ctrl.source)"
                     data-on-content-changed="$ctrl.onContentChanged($ctrl.source)"
                     data-on-selection-changed="$ctrl.onSelectionChanged($ctrl.source)"
@@ -152,7 +152,7 @@
                     <label class="editor-label">{{$ctrl.getEditorLabel($ctrl.right)}}</label>
                 </div>
                 <ng-quill-editor class="notranslate"
-                    id="editorTarget" data-modules="$ctrl.target.modulesConfig" data-theme="suggestions"
+                    id="editorTarget" spellcheck="false" data-modules="$ctrl.target.modulesConfig" data-theme="suggestions"
                     data-on-editor-created="$ctrl.onQuillCreated(editor, $ctrl.target)"
                     data-on-content-changed="$ctrl.onContentChanged($ctrl.target)"
                     data-on-selection-changed="$ctrl.onSelectionChanged($ctrl.target)"
diff --git a/src/angular-app/languageforge/translate/editor/editor.component.ts b/src/angular-app/languageforge/translate/editor/editor.component.ts
index e3c67109f6..4cc339cba1 100644
--- a/src/angular-app/languageforge/translate/editor/editor.component.ts
+++ b/src/angular-app/languageforge/translate/editor/editor.component.ts
@@ -1,5 +1,4 @@
 import * as angular from 'angular';
-import { setTimeout } from 'core-js/library/web/timers';
 import { SmtTrainProgress } from 'machine';
 import Quill from 'quill';
 
@@ -141,9 +140,12 @@ export class TranslateEditorController implements angular.IController {
             new TranslateUserPreferences();
           this.source.inputSystem = this.tecProject.config.source.inputSystem;
           this.target.inputSystem = this.tecProject.config.target.inputSystem;
-          this.machine.initialise(this.tecProject.slug, this.tecProject.config.isTranslationDataScripture);
+          this.machine.initialise(this.tecProject.slug);
           this.showFormats =  this.tecProject.config.userPreferences.isFormattingOptionsShown;
 
+          this.source.isScripture = this.tecProject.config.isTranslationDataScripture;
+          this.target.isScripture = this.tecProject.config.isTranslationDataScripture;
+
           if (this.tecProject.config.documentSets.idsOrdered != null &&
             this.tecProject.config.documentSets.idsOrdered.length > 0
           ) {
@@ -204,7 +206,11 @@ export class TranslateEditorController implements angular.IController {
         }
       });
 
-      this.machine.initialise(this.tecProject.slug, this.tecProject.config.isTranslationDataScripture);
+      this.machine.initialise(this.tecProject.slug);
+
+      this.source.isScripture = this.tecProject.config.isTranslationDataScripture;
+      this.target.isScripture = this.tecProject.config.isTranslationDataScripture;
+
       this.listenForTrainingStatus();
     }
   }
@@ -600,7 +606,7 @@ export class TranslateEditorController implements angular.IController {
 
         if (segmentChanged) {
           // select the corresponding source segment
-          this.source.switchCurrentSegment(this.target.currentSegmentIndex);
+          this.source.switchCurrentSegment(this.target.currentSegmentRef);
 
           if (this.currentDocType) {
             this.metricService.sendMetrics(true, this.target.currentSegmentDocumentSetId);
@@ -625,7 +631,7 @@ export class TranslateEditorController implements angular.IController {
 
       case DocType.SOURCE:
         if (segmentChanged) {
-          this.target.switchCurrentSegment(this.source.currentSegmentIndex);
+          this.target.switchCurrentSegment(this.source.currentSegmentRef);
 
           if (!this.currentDocType && this.selectedDocumentSetIndex in this.documentSets) {
             this.metricService.currentDocumentSetId = this.documentSets[this.selectedDocumentSetIndex].id;
diff --git a/src/angular-app/languageforge/translate/editor/segment.ts b/src/angular-app/languageforge/translate/editor/segment.ts
index 4d49737490..59737daaae 100644
--- a/src/angular-app/languageforge/translate/editor/segment.ts
+++ b/src/angular-app/languageforge/translate/editor/segment.ts
@@ -5,7 +5,7 @@ export class Segment {
   private _range: RangeStatic;
   private initialText: string;
 
-  constructor(public readonly documentSetId: string, public readonly index: number) { }
+  constructor(public readonly documentSetId: string, public readonly ref: string) { }
 
   get text(): string {
     return this._text;
diff --git a/src/angular-app/languageforge/translate/editor/segmenter.ts b/src/angular-app/languageforge/translate/editor/segmenter.ts
new file mode 100644
index 0000000000..0bd25d8e20
--- /dev/null
+++ b/src/angular-app/languageforge/translate/editor/segmenter.ts
@@ -0,0 +1,135 @@
+import { RangeStatic } from 'quill';
+import { MachineService } from '../core/machine.service';
+import { DocumentEditor } from './document-editor';
+
+/** Keeps the ref -> range map of a document's segments; subclasses decide how the text is divided. */
+export abstract class Segmenter {
+  protected readonly segments: Map<string, RangeStatic> = new Map<string, RangeStatic>();
+  protected _lastSegmentRef: string = ''; // '' also marks "nothing segmented yet" (see update)
+
+  constructor(protected readonly doc: DocumentEditor) { }
+  get lastSegmentRef(): string { return this._lastSegmentRef; }
+
+  /** Runs updateSegments() when the text changed or when no last segment ref has been recorded. */
+  update(textChange: boolean): void {
+    if (textChange || this._lastSegmentRef === '') {
+      this.updateSegments();
+    }
+  }
+
+  reset(): void {
+    this._lastSegmentRef = '';
+    this.segments.clear();
+  }
+
+  /** Range recorded for ref; empty range at 0 for empty text, or at quill.getLength() - 1 for an unknown ref. */
+  getSegmentRange(ref: string): RangeStatic {
+    if (this.doc.isTextEmpty) {
+      return { index: 0, length: 0 };
+    }
+    return this.segments.has(ref) ? this.segments.get(ref) : { index: this.doc.quill.getLength() - 1, length: 0 };
+  }
+
+  /** Ref of the first recorded segment ending at or after a collapsed range; undefined if none applies. */
+  getSegmentRef(range: RangeStatic): string {
+    if (range == null || range.length !== 0) {
+      return undefined;
+    }
+    for (const [ref, span] of this.segments) {
+      if (range.index <= span.index + span.length) {
+        return ref;
+      }
+    }
+    return undefined;
+  }
+
+  /** Repopulates segments and _lastSegmentRef from the document's current contents. */
+  protected abstract updateSegments(): void;
+}
+
+/** Segmenter whose refs are the positions ('0', '1', ...) of the ranges from machine.tokenizeDocumentText(). */
+export class MachineSegmenter extends Segmenter {
+  constructor(doc: DocumentEditor, private readonly machine: MachineService) {
+    super(doc);
+  }
+
+  /** Tokenizes all but the last character; adds an empty range when none exist or text follows the last one. */
+  protected updateSegments(): void {
+    const quill = this.doc.quill;
+    const text = quill.getText().substr(0, quill.getLength() - 1);
+    const ranges = this.machine.tokenizeDocumentText(this.doc.docType, text);
+    if (ranges.length === 0) {
+      ranges.push({ index: 0, length: 0 });
+    } else {
+      const tail = ranges[ranges.length - 1];
+      if (tail.index + tail.length < text.length) {
+        ranges.push({ index: text.length, length: 0 });
+      }
+    }
+    this.reset();
+    ranges.forEach((range, i) => this.segments.set(i.toString(), range));
+    this._lastSegmentRef = (ranges.length - 1).toString();
+  }
+}
+
+/** Segmenter that records 'style_n', 'chapter_c' and 'verse_c:v' ranges from the attributes of the Quill delta. */
+export class UsxSegmenter extends Segmenter {
+  /** Rebuilds the segment map in one pass over the delta, closing a range at each para, chapter or verse op. */
+  protected updateSegments(): void {
+    const delta = this.doc.quill.getContents();
+    this.reset();
+    const nextStyleIds = new Map<string, number>(); // per-style counter used to number paragraph refs
+    let chapter = '';
+    let verse = '';
+    let curIndex = 0; // document offset where the pending range starts
+    let curRangeLen = 0; // length accumulated since curIndex
+
+    delta.forEach(op => {
+      const len = typeof op.insert === 'string' ? op.insert.length : 1; // non-string inserts count as 1
+      const attrs = op.attributes;
+      if (attrs != null && attrs.para != null) {
+        // a para op closes the pending range (when non-empty) under the next id of its style
+        const style = attrs.para.style as string;
+        const styleId = nextStyleIds.has(style) ? nextStyleIds.get(style) : 0;
+        if (curRangeLen > 0) {
+          this._lastSegmentRef = style + '_' + styleId;
+          this.segments.set(this._lastSegmentRef, { index: curIndex, length: curRangeLen });
+        }
+        nextStyleIds.set(style, styleId + 1); // the id is consumed even when nothing was recorded
+        curIndex += curRangeLen + len;
+        curRangeLen = 0;
+      } else if (attrs != null && attrs.chapter != null) {
+        // a chapter op always records the pending range and clears the current verse
+        chapter = attrs.chapter.number as string;
+        verse = '';
+        this._lastSegmentRef = 'chapter_' + chapter;
+        this.segments.set(this._lastSegmentRef, { index: curIndex, length: curRangeLen });
+        curIndex += curRangeLen + len;
+        curRangeLen = 0;
+      } else if (attrs != null && attrs.verse != null) {
+        if (verse !== '') {
+          // record the verse that just ended, leaving its trailing whitespace out of the range
+          const verseText = this.doc.quill.getText(curIndex, curRangeLen);
+          let end = verseText.length;
+          while (end > 0 && UsxSegmenter.isWhitespace(verseText[end - 1])) {
+            end--;
+          }
+          const verseRangeLen = curRangeLen - (verseText.length - end);
+          this._lastSegmentRef = 'verse_' + chapter + ':' + verse;
+          this.segments.set(this._lastSegmentRef, { index: curIndex, length: verseRangeLen });
+        }
+        // the verse op itself starts the next pending range
+        curIndex += curRangeLen;
+        curRangeLen = len;
+        verse = attrs.verse.number as string;
+      } else {
+        curRangeLen += len; // plain or otherwise-attributed content extends the pending range
+      }
+    });
+  }
+
+  /** True when char contains a whitespace character. */
+  private static isWhitespace(char: string): boolean {
+    return /\s/.test(char);
+  }
+}
diff --git a/src/netcore-api/SIL.XForge.WebApi.Server/Services/DeltaUsxMapper.cs b/src/netcore-api/SIL.XForge.WebApi.Server/Services/DeltaUsxMapper.cs
index c53bd2819e..04d9caf89f 100644
--- a/src/netcore-api/SIL.XForge.WebApi.Server/Services/DeltaUsxMapper.cs
+++ b/src/netcore-api/SIL.XForge.WebApi.Server/Services/DeltaUsxMapper.cs
@@ -14,26 +14,25 @@ public static Delta ToDelta(XElement usxElem)
         {
             var newDelta = new Delta();
             int nextNoteId = 1;
-            bool inVerse = false;
             foreach (XElement elem in usxElem.Elements())
             {
                 switch (elem.Name.LocalName)
                 {
                     case "para":
-                        ProcessChildNodes(newDelta, elem, ref inVerse, ref nextNoteId);
+                        ProcessChildNodes(newDelta, elem, ref nextNoteId);
                         newDelta.Insert("\n", OpAttributes("para", elem));
                         break;
                     case "chapter":
                         newDelta.Insert((string) elem.Attribute("number"));
                         newDelta.Insert("\n", OpAttributes("chapter", elem));
-                        inVerse = false;
                         break;
                 }
             }
+            newDelta.Insert("\n");
             return newDelta;
         }
 
-        private static void ProcessChildNodes(Delta newDelta, XElement elem, ref bool inVerse, ref int nextNoteId,
+        private static void ProcessChildNodes(Delta newDelta, XElement elem, ref int nextNoteId,
             JObject parentAttrs = null)
         {
             foreach (XNode node in elem.Nodes())
@@ -44,19 +43,7 @@ private static void ProcessChildNodes(Delta newDelta, XElement elem, ref bool in
                         switch (e.Name.LocalName)
                         {
                             case "verse":
-                                if (inVerse)
-                                {
-                                    JToken lastOp = newDelta.Ops[newDelta.Ops.Count - 1];
-                                    var text = (string) lastOp[Delta.InsertType];
-                                    if (text == null || text != "\n")
-                                    {
-                                        newDelta.Insert("\n", new JObject(
-                                            new JProperty("para", new JObject(
-                                                new JProperty("verse-alignment", "")))));
-                                    }
-                                }
                                 newDelta.Insert((string) e.Attribute("number"), OpAttributes("verse", e, parentAttrs));
-                                inVerse = true;
                                 break;
 
                             case "char":
@@ -67,7 +54,7 @@ private static void ProcessChildNodes(Delta newDelta, XElement elem, ref bool in
                                 string noteId = $"_note_{nextNoteId}";
                                 nextNoteId++;
                                 JObject noteAttrs = OpAttributes("note", e, parentAttrs, noteId);
-                                ProcessChildNodes(newDelta, e, ref inVerse, ref nextNoteId, noteAttrs);
+                                ProcessChildNodes(newDelta, e, ref nextNoteId, noteAttrs);
                                 break;
                         }
                         break;
@@ -119,11 +106,8 @@ public static XElement ToUsx(string bookId, string desc, Delta delta)
                         switch (prop.Name)
                         {
                             case "para":
-                                if (prop.Value["verse-alignment"] == null)
-                                {
-                                    newUsxElem.Add(CreateContainerElement("para", prop.Value, childNodes));
-                                    childNodes.Clear();
-                                }
+                                newUsxElem.Add(CreateContainerElement("para", prop.Value, childNodes));
+                                childNodes.Clear();
                                 break;
 
                             case "chapter":
diff --git a/tsconfig.json b/tsconfig.json
index 71e404687b..da30e09fef 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -10,8 +10,10 @@
   "compileOnSave": false,
   "compilerOptions": {
     "alwaysStrict": true,
+    "downlevelIteration": true,
     "emitDecoratorMetadata": true,
     "experimentalDecorators": true,
+    "importHelpers": true,
     "lib": ["es7", "dom"],
     "module": "es6",
     "moduleResolution": "node",