Skip to content

Commit

Permalink
add extra info entries to definitions (#185)
Browse files Browse the repository at this point in the history
* add extra info keys to lemma entries

Adds morphemes, etymology, and head info to lemma entries. Also deletes old lemma and form data before writing the new JSONs.

* write tests

Adds those extra info keys to the test entries.

* add new info entries to dictionary generation

Uses those extra keys in the dictionary creation process. Changes how morpheme text is extracted. Adds styling. Adds an extra archive type (*.gz) to .gitignore.

* write tests
  • Loading branch information
seth-js authored Jan 3, 2025
1 parent 0814121 commit b4f1e36
Show file tree
Hide file tree
Showing 31 changed files with 3,805 additions and 55 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
!data/test/ipa/**/*.json

*.zip
*.gz
data/**/*.css
!data/styles.css

Expand Down
63 changes: 61 additions & 2 deletions 3-tidy-up.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const { writeFileSync } = require('fs');
const { writeFileSync, readdirSync, unlinkSync } = require('fs');

const LineByLineReader = require('line-by-line');

Expand Down Expand Up @@ -125,7 +125,7 @@ lr.on('line', (line) => {
* @param {KaikkiLine} parsedLine
*/
function handleLine(parsedLine) {
const { pos, sounds, forms, etymology_number = 0 } = parsedLine;
const { pos, sounds, forms, etymology_number = 0, etymology_text} = parsedLine;
if(!pos) return;
const word = getCanonicalWordForm(parsedLine);
if (!word) return;
Expand Down Expand Up @@ -209,6 +209,33 @@ function handleLine(parsedLine) {
saveIpaResult(word, readings, pos, String(etymology_number), ipaObj);
}

for (const reading of readings) {
const currentEntry = lemmaDict[word][reading][pos][etymology_number];

if (etymology_text) {
const morphemeText = getMorphemes(etymology_text);

if (targetIso === 'en' && morphemeText) {
if (morphemeText === etymology_text) {
currentEntry.morpheme_text = morphemeText;
} else {
currentEntry.etymology_text = etymology_text;
currentEntry.morpheme_text = morphemeText;
}
} else {
currentEntry.etymology_text = etymology_text;
}
}

if (head_templates) {
const headInfo = getHeadInfo(head_templates);

if (headInfo) {
lemmaDict[word][reading][pos][etymology_number].head_info_text = headInfo;
}
}
}

const glossTree = getGlossTree(sensesWithoutInflectionGlosses);

for (const reading of readings) {
Expand All @@ -229,6 +256,32 @@ function handleLine(parsedLine) {

}

/**
 * Finds the first sentence of an etymology text that looks like a morpheme
 * breakdown.
 *
 * The text is cut after every period and each piece is trimmed. A piece
 * qualifies when it contains ' + ' and mentions neither "Proto" nor
 * "Inherited from".
 *
 * @param {string} text
 * @returns {string} the first qualifying piece, or '' when there is none
 */
function getMorphemes(text) {
  const sentences = text.split(/(?<=\.)/g).map((sentence) => sentence.trim());

  const looksLikeMorphemes = (sentence) => {
    if (!sentence.includes(' + ')) return false;
    return !/Proto|Inherited from/.test(sentence);
  };

  const found = sentences.find(looksLikeMorphemes);
  return found === undefined ? '' : found;
}

/**
 * Returns the expansion of the first head template whose expansion contains
 * a non-empty parenthesized part.
 *
 * @param {HeadTemplate[]} head_templates
 * @returns {string} the matching expansion, or '' when no template qualifies
 */
function getHeadInfo(head_templates) {
  // At least one character between an opening and a closing parenthesis.
  const parenthesized = /(?<=\().+?(?=\))/;

  const match = head_templates.find(
    ({ expansion }) => Boolean(expansion) && parenthesized.test(expansion),
  );

  return match ? match.expansion : '';
}

/**
* @param {Example} example
* @returns {StandardizedExample}
Expand Down Expand Up @@ -638,6 +691,12 @@ lr.on('end', () => {
clearConsoleLine();
process.stdout.write(`Processed ${lineCount} lines...\n`);

for (const file of readdirSync(writeFolder)) {
if (file.includes(`${sourceIso}-${targetIso}`)) {
unlinkSync(`${writeFolder}/${file}`);
}
}

const lemmasFilePath = `${writeFolder}/${sourceIso}-${targetIso}-lemmas.json`;
consoleOverwrite(`3-tidy-up.js: Writing lemma dict to ${lemmasFilePath}...`);
writeFileSync(lemmasFilePath, JSON.stringify(lemmaDict, mapJsonReplacer));
Expand Down
74 changes: 74 additions & 0 deletions 4-make-yomitan.js
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,66 @@ function getStructuredExamples(examples) {
});
}

/**
 * Builds one collapsible `details` node: a `summary` showing `type`, followed
 * by a `div` holding `content`.
 *
 * @param {string} type used as the summary text and inside both data markers
 * @param {string} content placed as the content of the inner `div`
 * @returns {import('types').TermBank.StructuredContent}
 */
function buildDetailsEntry(type, content) {
  const summary = {
    tag: 'summary',
    data: { content: 'summary-entry' },
    content: type,
  };

  const body = {
    tag: 'div',
    data: { content: `${type}-content` },
    content,
  };

  return {
    tag: 'details',
    data: { content: `details-entry-${type}` },
    content: [summary, body],
  };
}

/**
 * Builds the "details-section" node for a lemma: one collapsible entry
 * (via `buildDetailsEntry`) for each of morphemes, etymology and head info
 * that has a truthy value on `info`, in that order.
 *
 * @param {LemmaInfo} info read for `morpheme_text`, `etymology_text` and `head_info_text`
 * @returns {import('types').TermBank.StructuredContent} a `div` whose content
 *   is the (possibly empty) list of detail entries
 */
function getStructuredDetails(info) {
  const result = [];

  const {
    etymology_text: etymology,
    morpheme_text: morphemes,
    head_info_text: headInfo,
  } = info;

  // The title doubles as the visible summary label and as part of the
  // data markers, so it must be spelled correctly ('morphemes', not 'mophemes').
  for (const [title, content] of [
    ['morphemes', morphemes],
    ['etymology', etymology],
    ['head-info', headInfo],
  ]) {
    if (title && content) result.push(buildDetailsEntry(title, content));
  }

  return {
    tag: 'div',
    data: { content: 'details-section' },
    content: result,
  };
}

/**
* @param {GlossTwig} glossTwig
* @param {string[]} senseTags
Expand Down Expand Up @@ -315,6 +375,20 @@ let lastTermBankIndex = 0;

debug(entries);
for (const [tags, entry] of Object.entries(entries)) {
if (info.etymology_text || info.head_info_text || info.morpheme_text) {
const lastDef = entry[5][entry[5].length - 1];

if (
lastDef &&
typeof lastDef === 'object' &&
'type' in lastDef &&
lastDef.type === 'structured-content' &&
Array.isArray(lastDef.content)
) {
lastDef.content.push(getStructuredDetails(info));
}
}

ymtLemmas.push(entry);
}
}
Expand Down
28 changes: 28 additions & 0 deletions data/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,32 @@ div[data-sc-content="example-sentence-a"] {
}
div[data-sc-content="example-sentence-b"] {
font-size: 0.8em;
}
/* Vertical spacing around the container that holds the detail entries. */
div[data-sc-content="details-section"] {
margin: 0.25em 0;
}
/* Applies to every details element whose marker starts with "details-entry". */
details[data-sc-content^="details-entry"] {
padding-left: 0;
}
/* The summary label is not text-selectable and is only as wide as its content. */
summary[data-sc-content="summary-entry"] {
user-select: none;
width: max-content;
}
/* When the gloss list has data-count="1", draw the summary marker inside the box. */
ul.gloss-list[data-count="1"] summary[data-sc-content="summary-entry"] {
list-style-position: inside;
}
/* Marker color; the custom property is not defined in the visible part of this file. */
summary[data-sc-content="summary-entry"]::marker {
color: var(--checkbox-disabled-color);
}
/* Same selector as the summary rule above; this one sets the label color. */
summary[data-sc-content="summary-entry"] {
color: var(--text-color-light4);
}
/* While the details element is open, the label uses the regular text color. */
details[data-sc-content^="details-entry"][open=""] summary[data-sc-content="summary-entry"] {
color: var(--text-color);
}
/* Pointer cursor while hovering the summary label. */
summary[data-sc-content="summary-entry"]:hover {
cursor: pointer;
}
/* Vertical spacing for any div that follows the summary as a sibling. */
summary[data-sc-content="summary-entry"] ~ div {
margin: 0.5em 0;
}
90 changes: 90 additions & 0 deletions data/test/dict/cs/en/term_bank_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,36 @@
}
}
]
},
{
"tag": "div",
"data": {
"content": "details-section"
},
"content": [
{
"tag": "details",
"data": {
"content": "details-entry-etymology"
},
"content": [
{
"tag": "summary",
"data": {
"content": "summary-entry"
},
"content": "etymology"
},
{
"tag": "div",
"data": {
"content": "etymology-content"
},
"content": "Deverbal from zpravit."
}
]
}
]
}
]
}
Expand Down Expand Up @@ -187,6 +217,36 @@
}
}
]
},
{
"tag": "div",
"data": {
"content": "details-section"
},
"content": [
{
"tag": "details",
"data": {
"content": "details-entry-etymology"
},
"content": [
{
"tag": "summary",
"data": {
"content": "summary-entry"
},
"content": "etymology"
},
{
"tag": "div",
"data": {
"content": "etymology-content"
},
"content": "Inherited from Old Czech pro, from Proto-Slavic *pro."
}
]
}
]
}
]
}
Expand All @@ -209,6 +269,36 @@
"content": [
"(reflexive with se) to dispute"
]
},
{
"tag": "div",
"data": {
"content": "details-section"
},
"content": [
{
"tag": "details",
"data": {
"content": "details-entry-etymology"
},
"content": [
{
"tag": "summary",
"data": {
"content": "summary-entry"
},
"content": "etymology"
},
{
"tag": "div",
"data": {
"content": "etymology-content"
},
"content": "Inherited from Old Czech přieti, from Proto-Slavic *pьrěti."
}
]
}
]
}
]
}
Expand Down
Loading

0 comments on commit b4f1e36

Please sign in to comment.