Handle glossary link entries (#474)

* Handle glossary link entries
* Minor changes
* Lint format issues
* Eliminate glossary reference in ScriptureViewSophria and general cleanup
* Check in isBibleBook
* Check in tests for isBibleBook
* Fix tests and commit changes to make tests run independently of installed application
* Fix lint check
* Fix test imports
* Added comments
sillsdev · Apr 26, 2024 · 1e7e8f7 · 1e7e8f7
1 parent e463432
commit 1e7e8f7
Show file tree

Hide file tree

Showing 18 changed files with 17,878 additions and 69 deletions.
diff --git a/.eslintignore b/.eslintignore
@@ -10,6 +10,7 @@ data
 src/config.js
 static
 example_data
+test_data
 
 # Ignore files for PNPM, NPM and YARN
 pnpm-lock.yaml

diff --git a/.prettierignore b/.prettierignore
@@ -12,6 +12,7 @@ src/lib/data/catalog.js
 src/lib/data/firebase-config.js
 static
 example_data
+test_data
 
 # Ignore files for PNPM, NPM and YARN
 pnpm-lock.yaml

diff --git a/scripts/convertBooks.ts b/scripts/convertBooks.ts
@@ -3,11 +3,12 @@
 
 import { ConfigTaskOutput } from './convertConfig';
 import { TaskOutput, Task, Promisable } from './Task';
-import { readFile, writeFile, writeFileSync, mkdirSync, existsSync } from 'fs';
+import { readFile, readFileSync, writeFile, writeFileSync, mkdirSync, existsSync } from 'fs';
 import path from 'path';
 import { SABProskomma } from '../sab-proskomma';
 import { queries, postQueries, freeze } from '../sab-proskomma-tools';
 import { convertMarkdownsToMilestones } from './convertMarkdown';
+import { verifyGlossaryEntries } from './verifyGlossaryEntries';
 
 /**
  * Loops through bookCollections property of configData.
@@ -24,7 +25,26 @@ function replaceVideoTags(text: string, _bcId: string, _bookId: string): string
+/**
+ * Rewrites every `\page <value>` marker in the text as `\zpage |id="<value>"\*`.
+ *
+ * `<value>` is everything after the single space following `\page`, up to the
+ * end of that line (`.` does not match line terminators).
+ *
+ * @param text - Book content to rewrite.
+ * @param _bcId - Not read by this function.
+ * @param _bookId - Not read by this function.
+ * @returns The text with all `\page` markers rewritten.
+ */
 function replacePageTags(text: string, _bcId: string, _bookId: string): string {
     return text.replace(/\\page (.*)/g, '\\zpage |id="$1"\\*');
 }
-
+/**
+ * Collects the glossary keywords defined in a book collection.
+ *
+ * Every book in `collection.books` whose `type` is 'glossary' is read from
+ * `<dataDir>/books/<collection.id>/<book.file>`, and each `\k ...\k*` span
+ * found in it contributes one keyword.
+ *
+ * @param collection - Book collection; only `id`, `books[].type` and
+ *     `books[].file` are read here.
+ * @param configData - Not read by this function; kept so existing callers
+ *     do not have to change.
+ * @param dataDir - Root data directory that contains the `books` folder.
+ * @returns The keywords in file order, trimmed of surrounding whitespace.
+ *     Duplicates are kept.
+ * @throws Whatever `readFileSync` throws when a glossary file cannot be read.
+ */
+function loadGlossary(collection: any, configData: ConfigTaskOutput, dataDir: string): string[] {
+    // Group 1 is the text between \k and \k*. Built once for all books.
+    const keywordPattern = /\\k\s*([^\\]+)\s*\\k\*/g;
+    const glossary: string[] = [];
+    for (const book of collection.books) {
+        if (book.type !== 'glossary') {
+            continue;
+        }
+        const glossaryContent = readFileSync(
+            path.join(dataDir, 'books', collection.id, book.file),
+            'utf8'
+        );
+        for (const match of glossaryContent.matchAll(keywordPattern)) {
+            // The capture group is greedy, so it swallows any whitespace that
+            // precedes \k*; trim it so callers receive the bare keyword.
+            glossary.push(match[1].trim());
+        }
+    }
+    return glossary;
+}
 function removeStrongNumberReferences(text: string, _bcId: string, _bookId: string): string {
     //remove strong number references
     // \v 1  \w In|strong="H0430"\w* \w the|strong="H0853"\w* \w beginning|strong="H7225"\w*, (Gen 1:1 WEBBE)
@@ -102,6 +122,7 @@ export async function convertBooks(
     for (const collection of collections!) {
         const pk = new SABProskomma();
         const lang = collection.languageCode;
+        let bcGlossary: string[] = [];
         if (verbose && usedLangs.has(lang)) {
             console.warn(`Language ${lang} already used in another collection. Proceeding anyway.`);
         }
@@ -115,6 +136,10 @@ export async function convertBooks(
         const docs: Promise<void>[] = [];
         //loop through books in collection
         const ignoredBooks = [];
+        // If the collection has a glossary, load it
+        if (configData.data.traits['has-glossary']) {
+            bcGlossary = loadGlossary(collection, configData, dataDir);
+        }
         for (const book of collection.books) {
             if (book.type && unsupportedBookTypes.includes(book.type)) {
                 // Ignore non-default books for now
@@ -132,7 +157,9 @@ export async function convertBooks(
                             if (err) throw err;
                             process.stdout.write(` ${book.id}`);
                             content = applyFilters(content, bcId, book.id);
-
+                            if (configData.data.traits['has-glossary']) {
+                                content = verifyGlossaryEntries(content, bcGlossary);
+                            }
                             //query Proskomma with a mutation to add a document
                             //more efficient than original pk.addDocument call
                             //as it can be run asynchronously

diff --git a/scripts/convertConfig.ts b/scripts/convertConfig.ts
@@ -963,7 +963,6 @@ function filterFeaturesNotReady(data: ConfigData) {
     data.mainFeatures['share-apple-app-link'] = false;
 
     // Some settings are not done
-    data.mainFeatures['settings-glossary-links'] = false;
     data.mainFeatures['settings-verse-of-the-day'] = false;
     data.mainFeatures['settings-verse-of-the-day-time'] = false;
     data.mainFeatures['settings-verse-of-the-day-book-collection'] = false;

diff --git a/scripts/convertMarkdown.test.ts b/scripts/convertMarkdown.test.ts
@@ -4,10 +4,7 @@ import path from 'path';
 import { convertMarkdownsToMilestones } from './convertMarkdown';
 
 describe('convertMarkdown', () => {
-    const data = readFileSync(
-        path.join('example_data', 'books', 'C01', '01GENengWEBbd.usfm'),
-        'utf8'
-    );
+    const data = readFileSync(path.join('test_data', 'books', 'C01', '01GENengWEBbd.usfm'), 'utf8');
     let modifiedContent: string;
     beforeEach(() => {
         modifiedContent = convertMarkdownsToMilestones(data, 'C01', 'GEN');

diff --git a/scripts/stringUtils.ts b/scripts/stringUtils.ts
@@ -175,3 +175,18 @@ export function padWithInitialZeros(input: string, length: number): string {
 
     return result;
 }
+/**
+ * Compares two strings with `localeCompare` using `sensitivity: 'accent'`
+ * in the default locale.
+ */
+function ciEqualsInner(a: string, b: string): boolean {
+    return a.localeCompare(b, undefined, { sensitivity: 'accent' }) === 0;
+}
+
+// Feature detection, evaluated once when the module loads rather than on every
+// comparison: true when the locale-aware comparison treats 'A' and 'a' as equal.
+const localeCompareIgnoresCase = ciEqualsInner('A', 'a');
+
+/**
+ * Case-insensitive equality check.
+ *
+ * When both arguments are strings they are compared with the locale-aware
+ * comparison if it ignores case here, and otherwise by comparing their
+ * upper-cased forms. If either argument is not a string the result is plain
+ * strict equality (`a === b`).
+ *
+ * @param a - First value.
+ * @param b - Second value.
+ * @returns `true` when the two values are considered equal.
+ */
+export function ciEquals(a: unknown, b: unknown): boolean {
+    if (typeof a !== 'string' || typeof b !== 'string') {
+        return a === b;
+    }
+    return localeCompareIgnoresCase ? ciEqualsInner(a, b) : a.toUpperCase() === b.toUpperCase();
+}
diff --git a/scripts/verifyGlossaryEntries.test.ts b/scripts/verifyGlossaryEntries.test.ts
@@ -0,0 +1,95 @@
+import { describe, expect, beforeEach, it, test } from 'vitest';
+import { readFile, readFileSync, writeFile, writeFileSync, mkdirSync, existsSync } from 'fs';
+import path from 'path';
+import { verifyGlossaryEntries } from './verifyGlossaryEntries';
+
+describe('verifyGlossaryEntries', () => {
+    // Fixture: the Genesis USFM file of test collection C01.
+    const usfmPath = path.join('test_data', 'books', 'C01', '01GENengWEBbd.usfm');
+    const data = readFileSync(usfmPath, 'utf8');
+
+    // Fragments expected when a glossary link is kept.
+    const LINKED_SERPENT = 'Now the \\w serpent\\w* was more';
+    const LINKED_SUBTLE = 'more \\w subtle \\w*than any animal';
+    const LINKED_TREES = 'We may eat fruit from the \\w trees|tree \\w* of the garden';
+    // Fragments expected when a glossary link is reduced to plain text.
+    const PLAIN_SERPENT = 'Now the serpent was more';
+    const PLAIN_SUBTLE = 'more subtle than any animal';
+    const PLAIN_TREES = 'We may eat fruit from the trees of the garden';
+
+    // One scenario per glossary; each expectation is [test name, expected fragment].
+    type Scenario = { title: string; glossary: string[]; expectations: [string, string][] };
+    const scenarios: Scenario[] = [
+        {
+            title: 'with all entries in the glossary',
+            glossary: ['excess', 'serpent', 'middle', 'subtle', 'tree', 'extra'],
+            expectations: [
+                ['leaves in place simple entry', LINKED_SERPENT],
+                ['leaves in place entry with an extra space', LINKED_SUBTLE],
+                ['leaves in place entry using lemma', LINKED_TREES]
+            ]
+        },
+        {
+            title: 'with all entries case mismatch',
+            glossary: ['Excess', 'Serpent', 'Middle', 'Subtle', 'Tree', 'Extra'],
+            expectations: [
+                ['leaves in place simple entry', LINKED_SERPENT],
+                ['leaves in place entry with an extra space', LINKED_SUBTLE],
+                ['leaves in place entry using lemma', LINKED_TREES]
+            ]
+        },
+        {
+            title: 'with one mismatch',
+            glossary: ['excess', 'serpent', 'middle', 'subtle', 'trees', 'extra'],
+            expectations: [
+                ['leaves in place simple entry', LINKED_SERPENT],
+                ['leaves in place entry with an extra space', LINKED_SUBTLE],
+                ['removes when matches first but not lemma', PLAIN_TREES]
+            ]
+        },
+        {
+            title: 'with missing entries in the glossary',
+            glossary: ['excess', 'serpent', 'more', 'middle', 'tree', 'extra'],
+            expectations: [
+                ['leaves in place simple entry', LINKED_SERPENT],
+                ['removes mismatched entry', PLAIN_SUBTLE],
+                ['leaves in place entry using lemma', LINKED_TREES]
+            ]
+        },
+        {
+            title: 'with empty glossary',
+            glossary: [],
+            expectations: [
+                ['removes simple entry', PLAIN_SERPENT],
+                ['removes entry with an extra space', PLAIN_SUBTLE],
+                ['removes entry using lemma', PLAIN_TREES]
+            ]
+        }
+    ];
+
+    for (const { title, glossary, expectations } of scenarios) {
+        describe(title, () => {
+            let modifiedContent: string;
+            // Re-run the verification before every test, as each test reads its result.
+            beforeEach(() => {
+                modifiedContent = verifyGlossaryEntries(data, glossary);
+            });
+            for (const [testName, fragment] of expectations) {
+                it(testName, () => {
+                    expect(modifiedContent).toContain(fragment);
+                });
+            }
+        });
+    }
+});
diff --git a/scripts/verifyGlossaryEntries.ts b/scripts/verifyGlossaryEntries.ts
@@ -0,0 +1,52 @@
+import { ciEquals } from './stringUtils';
+
+/**
+ * Removes `\w ...\w*` glossary links whose target is not in the glossary.
+ *
+ * For every `\w ...\w*` span in `content`, the target word is taken from the
+ * text between the markers (the part after `|` when there is one, otherwise
+ * the whole text) and compared case-insensitively against the trimmed
+ * glossary entries. A span whose target is found is left untouched; any other
+ * span is replaced by the text that precedes `|` (or by the whole inner text
+ * when there is no `|`), which drops the markers.
+ *
+ * @param content - Book content that may contain `\w ...\w*` spans.
+ * @param glossary - Known glossary keywords; surrounding whitespace is ignored.
+ * @returns The content with all unmatched glossary links reduced to plain text.
+ */
+export function verifyGlossaryEntries(content: string, glossary: string[]): string {
+    // Trim the keywords once instead of once per link found in the content.
+    const keywords = glossary.map((glossaryEntry) => glossaryEntry.trim());
+    // Group 1 is the text between \w and \w*.
+    const linkPattern = /\\w\s*([^\\]+)\s*\\w\*/g;
+    // A replacer function is used so the returned text is inserted literally,
+    // without any `$` substitution patterns being interpreted.
+    return content.replace(linkPattern, (originalEntry: string, inner: string) => {
+        const matchWord = entryToMatch(inner);
+        const matchFound = keywords.some((keyword) => ciEquals(keyword, matchWord));
+        return matchFound ? originalEntry : textFromMatch(inner);
+    });
+}
+/**
+ * Picks the word to look up in the glossary from the text of a link.
+ *
+ * @param match - Text found between the link markers, optionally `text|target`.
+ * @returns The trimmed part after the first `|` when a `|` is present,
+ *     otherwise the trimmed text itself.
+ */
+function entryToMatch(match: string): string {
+    const [displayText, target] = match.split('|');
+    const chosen = target !== undefined ? target : displayText;
+    return chosen.trim();
+}
+/**
+ * Returns the part of a link's text that precedes the first `|`.
+ *
+ * @param match - Text found between the link markers, optionally `text|target`.
+ * @returns Everything before the first `|`, untrimmed, or the whole input
+ *     when it contains no `|`.
+ */
+function textFromMatch(match: string): string {
+    const barIndex = match.indexOf('|');
+    return barIndex === -1 ? match : match.substring(0, barIndex);
+}