diff --git a/.eslintrc.json b/.eslintrc.json index 4e7503ad71..6fdb8a14d6 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -647,6 +647,7 @@ "ext/js/language/ja/japanese-transforms.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", + "ext/js/language/ko/korean-hangul.js", "ext/js/language/ko/korean-text-processors.js", "ext/js/language/la/latin-text-preprocessors.js", "ext/js/language/language-descriptors.js", diff --git a/ext/js/language/ko/korean-hangul.js b/ext/js/language/ko/korean-hangul.js new file mode 100644 index 0000000000..d9863af79a --- /dev/null +++ b/ext/js/language/ko/korean-hangul.js @@ -0,0 +1,586 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +const HANGUL_OFFSET = 0xAC00; + +const CHO = [ + 'ㄱ', + 'ㄲ', + 'ㄴ', + 'ㄷ', + 'ㄸ', + 'ㄹ', + 'ㅁ', + 'ㅂ', + 'ㅃ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅉ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const JUNG = [ + 'ㅏ', + 'ㅐ', + 'ㅑ', + 'ㅒ', + 'ㅓ', + 'ㅔ', + 'ㅕ', + 'ㅖ', + 'ㅗ', + ['ㅗ', 'ㅏ'], + ['ㅗ', 'ㅐ'], + ['ㅗ', 'ㅣ'], + 'ㅛ', + 'ㅜ', + ['ㅜ', 'ㅓ'], + ['ㅜ', 'ㅔ'], + ['ㅜ', 'ㅣ'], + 'ㅠ', + 'ㅡ', + ['ㅡ', 'ㅣ'], + 'ㅣ' +]; + +const JONG = [ + '', + 'ㄱ', + 'ㄲ', + ['ㄱ', 'ㅅ'], + 'ㄴ', + ['ㄴ', 'ㅈ'], + ['ㄴ', 'ㅎ'], + 'ㄷ', + 'ㄹ', + ['ㄹ', 'ㄱ'], + ['ㄹ', 'ㅁ'], + ['ㄹ', 'ㅂ'], + ['ㄹ', 'ㅅ'], + ['ㄹ', 'ㅌ'], + ['ㄹ', 'ㅍ'], + ['ㄹ', 'ㅎ'], + 'ㅁ', + 'ㅂ', + ['ㅂ', 'ㅅ'], + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const CONSONANTS = [ + 'ㄱ', + 'ㄲ', + 'ㄳ', + 'ㄴ', + 'ㄵ', + 'ㄶ', + 'ㄷ', + 'ㄸ', + 'ㄹ', + 'ㄺ', + 'ㄻ', + 'ㄼ', + 'ㄽ', + 'ㄾ', + 'ㄿ', + 'ㅀ', + 'ㅁ', + 'ㅂ', + 'ㅃ', + 'ㅄ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅉ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const COMPLETE_CHO = [ + 'ㄱ', + 'ㄲ', + 'ㄴ', + 'ㄷ', + 'ㄸ', + 'ㄹ', + 'ㅁ', + 'ㅂ', + 'ㅃ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅉ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const COMPLETE_JUNG = [ + 'ㅏ', + 'ㅐ', + 'ㅑ', + 'ㅒ', + 'ㅓ', + 'ㅔ', + 'ㅕ', + 'ㅖ', + 'ㅗ', + 'ㅘ', + 'ㅙ', + 'ㅚ', + 'ㅛ', + 'ㅜ', + 'ㅝ', + 'ㅞ', + 'ㅟ', + 'ㅠ', + 'ㅡ', + 'ㅢ', + 'ㅣ' +]; + +const COMPLETE_JONG = [ + '', + 'ㄱ', + 'ㄲ', + 'ㄳ', + 'ㄴ', + 'ㄵ', + 'ㄶ', + 'ㄷ', + 'ㄹ', + 'ㄺ', + 'ㄻ', + 'ㄼ', + 'ㄽ', + 'ㄾ', + 'ㄿ', + 'ㅀ', + 'ㅁ', + 'ㅂ', + 'ㅄ', + 'ㅅ', + 'ㅆ', + 'ㅇ', + 'ㅈ', + 'ㅊ', + 'ㅋ', + 'ㅌ', + 'ㅍ', + 'ㅎ' +]; + +const COMPLEX_CONSONANTS = [ + ['ㄱ', 'ㅅ', 'ㄳ'], + ['ㄴ', 'ㅈ', 'ㄵ'], + ['ㄴ', 'ㅎ', 'ㄶ'], + ['ㄹ', 'ㄱ', 'ㄺ'], + ['ㄹ', 'ㅁ', 'ㄻ'], + ['ㄹ', 'ㅂ', 'ㄼ'], + ['ㄹ', 'ㅅ', 'ㄽ'], + ['ㄹ', 'ㅌ', 'ㄾ'], + ['ㄹ', 'ㅍ', 'ㄿ'], + ['ㄹ', 'ㅎ', 'ㅀ'], + ['ㅂ', 'ㅅ', 'ㅄ'] +]; + +const COMPLEX_VOWELS = [ + ['ㅗ', 'ㅏ', 'ㅘ'], + ['ㅗ', 'ㅐ', 'ㅙ'], + ['ㅗ', 'ㅣ', 'ㅚ'], + ['ㅜ', 'ㅓ', 'ㅝ'], + ['ㅜ', 'ㅔ', 'ㅞ'], + ['ㅜ', 'ㅣ', 'ㅟ'], + ['ㅡ', 'ㅣ', 'ㅢ'] +]; + +const makeHash = (array) => { + const hash = {0: 0}; + for (let i = 0; i < array.length; i++) { + if (array[i]) { + hash[array[i].charCodeAt(0)] = i; + } + } + return hash; +}; + +const CONSONANTS_HASH = makeHash(CONSONANTS); + +const CHO_HASH = makeHash(COMPLETE_CHO); + +const JUNG_HASH = makeHash(COMPLETE_JUNG); + +const JONG_HASH = makeHash(COMPLETE_JONG); + +const makeComplexHash = (array) => { + const hash = {}; + let code1, code2; + for (let i = 0; i < array.length; i++) { + code1 = array[i][0].charCodeAt(0); + code2 = array[i][1].charCodeAt(0); + if (typeof hash[code1] === 'undefined') { + hash[code1] = {}; + } + hash[code1][code2] = array[i][2].charCodeAt(0); + } + return hash; +}; + +const COMPLEX_CONSONANTS_HASH = makeComplexHash(COMPLEX_CONSONANTS); + +const COMPLEX_VOWELS_HASH = makeComplexHash(COMPLEX_VOWELS); + +/** + * Checks if the given character is a Korean consonant. + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean consonant, false otherwise. + */ +function isConsonant(c) { + return typeof CONSONANTS_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character is a Korean initial consonant (cho). + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean initial consonant, false otherwise. + */ +function isCho(c) { + return typeof CHO_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character is a Korean vowel (jung). + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean vowel, false otherwise. + */ +function isJung(c) { + return typeof JUNG_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character is a Korean final consonant (jong). + * @param {number} c The character to check. + * @returns {boolean} True if the character is a Korean final consonant, false otherwise. + */ +function isJong(c) { + return typeof JONG_HASH[c] !== 'undefined'; +} + +/** + * Checks if the given character code represents a Hangul character. + * @param {number} charCode The character code to check. + * @returns {boolean} True if the character code represents a Hangul character, false otherwise. + */ +function isHangul(charCode) { + return HANGUL_OFFSET <= charCode && charCode <= 0xd7a3; +} + +/** + * Retrieves the indices of the initial consonant (cho), vowel (jung), and final consonant (jong) + * that make up the given Hangul character code. + * @param {number} charCode The character code of the Hangul character. + * @returns {object} An object containing the indices of cho, jung, and jong. + */ +function getHangulIndices(charCode) { + const baseCode = charCode - HANGUL_OFFSET; + return { + cho: Math.floor(baseCode / 588), + jung: Math.floor((baseCode % 588) / 28), + jong: baseCode % 28 + }; +} + +/** + * Checks if the given characters 'a' and 'b' can be combined to form a complex vowel. + * @param {number} a The character code of the first vowel. + * @param {number} b The character code of the second vowel. + * @returns {(number|boolean)} The character code of the combined complex vowel, or false if they cannot be combined. + */ +function isJungJoinable(a, b) { + return (COMPLEX_VOWELS_HASH[a] && COMPLEX_VOWELS_HASH[a][b]) ? COMPLEX_VOWELS_HASH[a][b] : false; +} + +/** + * Checks if the given characters 'a' and 'b' can be combined to form a complex final consonant. + * @param {number} a The character code of the first final consonant. + * @param {number} b The character code of the second final consonant. + * @returns {(number|boolean)} The character code of the combined complex final consonant, or false if they cannot be combined. + */ +function isJongJoinable(a, b) { + return COMPLEX_CONSONANTS_HASH[a] && COMPLEX_CONSONANTS_HASH[a][b] ? COMPLEX_CONSONANTS_HASH[a][b] : false; +} + +/** + * Disassembles a given string into an array of individual Hangul characters or character components. + * @param {string} string The string to be disassembled. + * @param {boolean} [grouped=false] Whether to group the components of each Hangul character. + * @returns {(string|string[])} An array of individual Hangul characters or character components, or a single string if `grouped` is false. + * @throws {Error} If the input string is null. + */ +export const disassemble = (string, grouped = false) => { + if (string === null) { + throw new Error('Arguments cannot be null'); + } + + string = Array.isArray(string) ? string.join('') : string; + + const result = []; + + for (const character of string) { + const charCode = character.charCodeAt(0); + + if (isHangul(charCode)) { + const {cho, jung, jong} = getHangulIndices(charCode); + + const disassembled = [CHO[cho]]; + + if (Array.isArray(JUNG[jung])) { + disassembled.push(JUNG[jung].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else { + disassembled.push(JUNG[jung]); + } + + if (JONG[jong]) { + if (Array.isArray(JONG[jong])) { + disassembled.push(JONG[jong].join('')); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else { + disassembled.push(JONG[jong]); + } + } + + if (grouped) { + result.push(disassembled); + } else { + result.push(...disassembled); + } + } else if (isConsonant(charCode)) { + if (isCho(charCode)) { + result.push(CHO[CHO_HASH[charCode]]); + } else { + if (Array.isArray(JONG[JONG_HASH[charCode]])) { + result.push(JONG[JONG_HASH[charCode]].join('')); + } else { + result.push(JONG[JONG_HASH[charCode]]); + } + } + } else if (isJung(charCode)) { + if (Array.isArray(JUNG[JUNG_HASH[charCode]])) { + result.push(JUNG[JUNG_HASH[charCode]].join('')); + } else { + result.push(JUNG[JUNG_HASH[charCode]]); + } + } else { + result.push(character); + } + } + return grouped ? result : result.join(''); +}; + +/** + * Assembles an array of individual Hangul characters or character components into a single string. + * @param {string} string The string containing individual Hangul characters or character components to be assembled. + * @returns {string} The assembled string. + * @throws {Error} If the input string is null. + */ +export const assemble = (string) => { + if (string === null) { + throw new Error('Arguments cannot be null'); + } + + const array = [...disassemble(string)]; + + const result = []; + + let complete_index = -1, + jong_joined = false; + + /** + * Helper function to combine jamo into hangul + * @param {number} index Index of a hangul + */ + function makeHangul(index) { + let cho, + jung1, + jung2, + jong1 = 0, + jong2, + hangul = ''; + + jong_joined = false; + + if (complete_index + 1 > index) { + return; + } + + for (let step = 1; ; step++) { + // eslint-disable-next-line unicorn/prefer-switch + if (step === 1) { + cho = array[complete_index + step].charCodeAt(0); + if (isJung(cho)) { + if (complete_index + step + 1 <= index && isJung(jung1 = array[complete_index + step + 1].charCodeAt(0))) { + result.push(String.fromCharCode(isJungJoinable(cho, jung1))); + complete_index = index; + return; + } else { + result.push(array[complete_index + step]); + complete_index = index; + return; + } + } else if (!isCho(cho)) { + result.push(array[complete_index + step]); + complete_index = index; + return; + } + hangul = array[complete_index + step]; + } else if (step === 2) { + jung1 = array[complete_index + step].charCodeAt(0); + if (isCho(jung1)) { + result.push(String.fromCharCode(isJongJoinable(cho, jung1))); + complete_index = index; + return; + } else { + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + HANGUL_OFFSET); + } + } else if (step === 3) { + jung2 = array[complete_index + step].charCodeAt(0); + if (isJungJoinable(jung1, jung2)) { + jung1 = isJungJoinable(jung1, jung2); + } else { + jong1 = jung2; + } + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else if (step === 4) { + jong2 = array[complete_index + step].charCodeAt(0); + jong1 = isJongJoinable(jong1, jong2) ?? jong2; + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } else if (step === 5) { + jong2 = array[complete_index + step].charCodeAt(0); + jong1 = isJongJoinable(jong1, jong2); + hangul = String.fromCharCode((CHO_HASH[cho] * 21 + JUNG_HASH[jung1]) * 28 + JONG_HASH[jong1] + HANGUL_OFFSET); // eslint-disable-line @typescript-eslint/no-unsafe-argument + } + if (complete_index + step >= index) { + result.push(hangul); + complete_index = index; + return; + } + } + } + + let index, + stage = 0, + previousCharCode; + + for (index = 0; index < array.length; index++) { + const charCode = array[index].charCodeAt(0); + + if (!isCho(charCode) && !isJung(charCode) && !isJong(charCode)) { + makeHangul(index - 1); + makeHangul(index); + stage = 0; + continue; + } + + // eslint-disable-next-line unicorn/prefer-switch + if (stage === 0) { + if (isCho(charCode)) { + stage = 1; + } else if (isJung(charCode)) { + stage = 4; + } + } else if (stage === 1) { + if (isJung(charCode)) { + stage = 2; + } else { + if (isJongJoinable(previousCharCode, charCode)) { + stage = 5; + } else { + makeHangul(index - 1); + } + } + } else if (stage === 2) { + if (isJong(charCode)) { + stage = 3; + } else if (isJung(charCode)) { + if (!isJungJoinable(previousCharCode, charCode)) { + makeHangul(index - 1); + stage = 4; + } + } else { + makeHangul(index - 1); + stage = 1; + } + } else if (stage === 3) { + if (isJong(charCode)) { + if (!jong_joined && isJongJoinable(previousCharCode, charCode)) { + jong_joined = true; + } else { + makeHangul(index - 1); + stage = 1; + } + } else if (isCho(charCode)) { + makeHangul(index - 1); + stage = 1; + } else if (isJung(charCode)) { + makeHangul(index - 2); + stage = 2; + } + } else if (stage === 4) { + if (isJung(charCode)) { + if (isJungJoinable(previousCharCode, charCode)) { + makeHangul(index); + stage = 0; + } else { + makeHangul(index - 1); + } + } else { + makeHangul(index - 1); + stage = 1; + } + } else if (stage === 5) { + if (isJung(charCode)) { + makeHangul(index - 2); + stage = 2; + } else { + makeHangul(index - 1); + stage = 1; + } + } + previousCharCode = charCode; + } + makeHangul(index - 1); + return result.join(''); +}; diff --git a/ext/js/language/ko/korean-text-processors.js b/ext/js/language/ko/korean-text-processors.js index 859ddc80ae..b0df4b413e 100644 --- a/ext/js/language/ko/korean-text-processors.js +++ b/ext/js/language/ko/korean-text-processors.js @@ -15,13 +15,15 @@ * along with this program. If not, see . */ +import {assemble, disassemble} from './korean-hangul.js'; + /** @type {import('language').TextProcessor} */ export const disassembleHangul = { name: 'Disassemble Hangul', description: 'Disassemble Hangul characters into jamo.', options: [true], // Could probably also be set to [false, true], but this way it is always on process: (str) => { - return str; // Import from hangul.js + return disassemble(str); } }; @@ -31,6 +33,6 @@ export const reassembleHangul = { description: 'Reassemble Hangul characters from jamo.', options: [true], // Could probably also be set to [false, true], but this way it is always on process: (str) => { - return str; // Import from hangul.js + return assemble(str); } };