add extra info entries to definitions (#185)

* add extra info keys to lemma entries

  Adds morphemes, etymology, and head info to lemma entries. Also deletes old lemma and form data before writing the new JSONs.

* write tests

  Adds those extra info keys to the test entries.

* add new info entries to dictionary generation

  Uses those extra keys in the dictionary creation process. Changes how morpheme text is extracted. Adds styling. Adds the extra archive type (`*.gz`) to `.gitignore`.

* write tests
yomidevs · Jan 3, 2025 · b4f1e36 · b4f1e36
1 parent 0814121
commit b4f1e36
Show file tree

Hide file tree

Showing 31 changed files with 3,805 additions and 55 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,7 @@
 !data/test/ipa/**/*.json
 
 *.zip
+*.gz
 data/**/*.css
 !data/styles.css
 

diff --git a/3-tidy-up.js b/3-tidy-up.js
@@ -1,4 +1,4 @@
-const { writeFileSync } = require('fs');
+const { writeFileSync, readdirSync, unlinkSync } = require('fs');
 
 const LineByLineReader = require('line-by-line');
 
@@ -125,7 +125,7 @@ lr.on('line', (line) => {
  * @param {KaikkiLine} parsedLine 
  */
 function handleLine(parsedLine) {
-    const { pos, sounds, forms, etymology_number = 0 } = parsedLine;
+    const { pos, sounds, forms, etymology_number = 0, etymology_text} = parsedLine;
     if(!pos) return;
     const word = getCanonicalWordForm(parsedLine);
     if (!word) return;
@@ -209,6 +209,33 @@ function handleLine(parsedLine) {
         saveIpaResult(word, readings, pos, String(etymology_number), ipaObj);
     }
 
+    for (const reading of readings) {
+        const currentEntry = lemmaDict[word][reading][pos][etymology_number];
+
+        if (etymology_text) {
+            const morphemeText = getMorphemes(etymology_text);
+
+            if (targetIso === 'en' && morphemeText) {
+                if (morphemeText === etymology_text) {
+                    currentEntry.morpheme_text = morphemeText;
+                } else {
+                    currentEntry.etymology_text = etymology_text;
+                    currentEntry.morpheme_text = morphemeText;
+                }
+            } else {
+                currentEntry.etymology_text = etymology_text;
+            }
+        }
+
+        if (head_templates) {
+            const headInfo = getHeadInfo(head_templates);
+
+            if (headInfo) {
+                lemmaDict[word][reading][pos][etymology_number].head_info_text = headInfo;
+            }
+        }
+    }
+
     const glossTree = getGlossTree(sensesWithoutInflectionGlosses);
 
     for (const reading of readings) {
@@ -229,6 +256,32 @@ function handleLine(parsedLine) {
 
 }
 
+/**
+ * Finds the piece of an etymology text that is used as the morpheme text.
+ *
+ * The text is cut after every full stop and each piece is trimmed. A piece
+ * qualifies when it contains ' + ' and matches neither "Proto" nor
+ * "Inherited from"; the first qualifying piece wins.
+ *
+ * @param {string} text etymology text to scan
+ * @returns {string} the first qualifying piece, or '' when there is none
+ */
+function getMorphemes(text) {
+    const pieces = text.split(/(?<=\.)/g).map((piece) => piece.trim());
+    const isMorphemePiece = (piece) =>
+        piece.includes(' + ') && !/Proto|Inherited from/.test(piece);
+
+    // A qualifying piece always contains ' + ', so it can never be falsy.
+    return pieces.find(isMorphemePiece) || '';
+}
+
+/**
+ * Returns the expansion of the first head template whose expansion has at
+ * least one character between an opening and a closing parenthesis.
+ *
+ * @param {HeadTemplate[]} head_templates head templates to search, in order
+ * @returns {string} the complete expansion text of the first match, or ''
+ *     when no template qualifies
+ */
+function getHeadInfo(head_templates) {
+    const match = head_templates.find(
+        (entry) => entry.expansion && /(?<=\().+?(?=\))/.test(entry.expansion)
+    );
+
+    return match ? match.expansion : '';
+}
+
 /**
  * @param {Example} example
  * @returns {StandardizedExample}
@@ -638,6 +691,12 @@ lr.on('end', () => {
     clearConsoleLine();
     process.stdout.write(`Processed ${lineCount} lines...\n`);
 
+    for (const file of readdirSync(writeFolder)) {
+        if (file.includes(`${sourceIso}-${targetIso}`)) {
+            unlinkSync(`${writeFolder}/${file}`);
+        }
+    }
+
     const lemmasFilePath = `${writeFolder}/${sourceIso}-${targetIso}-lemmas.json`;
     consoleOverwrite(`3-tidy-up.js: Writing lemma dict to ${lemmasFilePath}...`);
     writeFileSync(lemmasFilePath, JSON.stringify(lemmaDict, mapJsonReplacer));

diff --git a/4-make-yomitan.js b/4-make-yomitan.js
@@ -145,6 +145,66 @@ function getStructuredExamples(examples) {
     });
 }
 
+/**
+ * Builds one collapsible `details` structured-content node.
+ *
+ * The node has two children: a `summary` whose content is `type`, and a
+ * `div` whose content is `content`. `type` is also embedded in the
+ * data-content values `details-entry-<type>` and `<type>-content`.
+ *
+ * @param {string} type label of the entry, also used in the data attributes
+ * @param {string} content text placed inside the collapsible body
+ * @returns {import('types').TermBank.StructuredContent}
+ */
+function buildDetailsEntry(type, content) {
+    const summaryNode = {
+        "tag": "summary",
+        "data": { "content": "summary-entry" },
+        "content": type
+    };
+
+    const bodyNode = {
+        "tag": "div",
+        "data": { "content": `${type}-content` },
+        "content": content
+    };
+
+    return {
+        "tag": "details",
+        "data": { "content": `details-entry-${type}` },
+        "content": [summaryNode, bodyNode]
+    };
+}
+
+/**
+ * Builds the "details-section" div that holds one collapsible entry for
+ * each extra-info text present on a lemma.
+ *
+ * Entries are emitted in the fixed order morphemes, etymology, head-info.
+ * A text that is missing or empty produces no entry.
+ *
+ * @param {LemmaInfo} info lemma info; reads `morpheme_text`,
+ *     `etymology_text` and `head_info_text`
+ * @returns {import('types').TermBank.StructuredContent} the wrapper div;
+ *     its content array is empty when none of the three texts is set
+ */
+function getStructuredDetails(info) {
+    const result = [];
+
+    const {
+        etymology_text: etymology,
+        morpheme_text: morphemes,
+        head_info_text: headInfo
+    } = info;
+
+    // Each title becomes the visible <summary> text and part of the
+    // data-content attribute values, so its spelling is user-facing.
+    for (const [title, content] of [
+        ['morphemes', morphemes],
+        ['etymology', etymology],
+        ['head-info', headInfo],
+    ]) {
+        if (title && content) result.push(buildDetailsEntry(title, content));
+    }
+
+    return {
+        "tag": "div",
+        "data": {
+            "content": "details-section"
+        },
+        "content": result
+    };
+}
+
 /**
  * @param {GlossTwig} glossTwig
  * @param {string[]} senseTags
@@ -315,6 +375,20 @@ let lastTermBankIndex = 0;
 
                     debug(entries);
                     for (const [tags, entry] of Object.entries(entries)) {
+                        if (info.etymology_text || info.head_info_text || info.morpheme_text) {
+                            const lastDef = entry[5][entry[5].length - 1];
+
+                            if (
+                                lastDef &&
+                                typeof lastDef === 'object' &&
+                                'type' in lastDef &&
+                                lastDef.type === 'structured-content' &&
+                                Array.isArray(lastDef.content)
+                            ) {
+                                lastDef.content.push(getStructuredDetails(info));
+                            }
+                        }
+
                         ymtLemmas.push(entry);
                     }
                 }

diff --git a/data/styles.css b/data/styles.css
@@ -17,4 +17,32 @@ div[data-sc-content="example-sentence-a"] {
 }
 div[data-sc-content="example-sentence-b"] {
     font-size: 0.8em;
+}
+div[data-sc-content="details-section"] { /* wrapper div around all collapsible detail entries */
+    margin: 0.25em 0;
+}
+details[data-sc-content^="details-entry"] { /* prefix match: every details-entry-<type> element */
+    padding-left: 0;
+}
+summary[data-sc-content="summary-entry"] { /* the label of a collapsible entry */
+    user-select: none;
+    width: max-content;
+}
+ul.gloss-list[data-count="1"] summary[data-sc-content="summary-entry"] { /* NOTE(review): presumably the single-gloss layout; confirm against the renderer's markup */
+    list-style-position: inside;
+}
+summary[data-sc-content="summary-entry"]::marker { /* disclosure marker; colour variable is defined outside this file */
+    color: var(--checkbox-disabled-color);
+}
+summary[data-sc-content="summary-entry"] { /* label colour while no more specific rule applies */
+    color: var(--text-color-light4);
+}
+details[data-sc-content^="details-entry"][open=""] summary[data-sc-content="summary-entry"] { /* label colour while the entry is open */
+    color: var(--text-color);
+}
+summary[data-sc-content="summary-entry"]:hover {
+    cursor: pointer;
+}
+summary[data-sc-content="summary-entry"] ~ div { /* div siblings following the summary, i.e. the entry body */
+    margin: 0.5em 0;
 }
diff --git a/data/test/dict/cs/en/term_bank_1.json b/data/test/dict/cs/en/term_bank_1.json
@@ -137,6 +137,36 @@
                 }
               }
             ]
+          },
+          {
+            "tag": "div",
+            "data": {
+              "content": "details-section"
+            },
+            "content": [
+              {
+                "tag": "details",
+                "data": {
+                  "content": "details-entry-etymology"
+                },
+                "content": [
+                  {
+                    "tag": "summary",
+                    "data": {
+                      "content": "summary-entry"
+                    },
+                    "content": "etymology"
+                  },
+                  {
+                    "tag": "div",
+                    "data": {
+                      "content": "etymology-content"
+                    },
+                    "content": "Deverbal from zpravit."
+                  }
+                ]
+              }
+            ]
           }
         ]
       }
@@ -187,6 +217,36 @@
                 }
               }
             ]
+          },
+          {
+            "tag": "div",
+            "data": {
+              "content": "details-section"
+            },
+            "content": [
+              {
+                "tag": "details",
+                "data": {
+                  "content": "details-entry-etymology"
+                },
+                "content": [
+                  {
+                    "tag": "summary",
+                    "data": {
+                      "content": "summary-entry"
+                    },
+                    "content": "etymology"
+                  },
+                  {
+                    "tag": "div",
+                    "data": {
+                      "content": "etymology-content"
+                    },
+                    "content": "Inherited from Old Czech pro, from Proto-Slavic *pro."
+                  }
+                ]
+              }
+            ]
           }
         ]
       }
@@ -209,6 +269,36 @@
             "content": [
               "(reflexive with se) to dispute"
             ]
+          },
+          {
+            "tag": "div",
+            "data": {
+              "content": "details-section"
+            },
+            "content": [
+              {
+                "tag": "details",
+                "data": {
+                  "content": "details-entry-etymology"
+                },
+                "content": [
+                  {
+                    "tag": "summary",
+                    "data": {
+                      "content": "summary-entry"
+                    },
+                    "content": "etymology"
+                  },
+                  {
+                    "tag": "div",
+                    "data": {
+                      "content": "etymology-content"
+                    },
+                    "content": "Inherited from Old Czech přieti, from Proto-Slavic *pьrěti."
+                  }
+                ]
+              }
+            ]
           }
         ]
       }
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,6 +16,7 @@ @@
     !data/test/ipa/**/*.json
     *.zip
+    *.gz
     data/**/*.css
     !data/styles.css
@@ Expand Down @@