From efd900b1fa19fe5d962b31de3a9f0e56be2606c5 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 28 Feb 2024 21:07:45 -0500 Subject: [PATCH] Support KS X 1026-1 --- .github/workflows/rust.yml | 4 +- Cargo.toml | 4 + README.md | 17 ++- src/ks_x_1026_1.rs | 233 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 134 +++++++++++++++++++-- src/normalize.rs | 37 +++--- tests/ks_x_1026_1.rs | 103 ++++++++++++++++ 7 files changed, 503 insertions(+), 29 deletions(-) create mode 100644 src/ks_x_1026_1.rs create mode 100644 tests/ks_x_1026_1.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index db9efdc..e19421d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -11,7 +11,7 @@ env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 RUSTFLAGS: -D warnings - RUSTDOCFLAGS: -D warnings --cfg docsrs + RUSTDOCFLAGS: -D warnings jobs: build: @@ -44,6 +44,8 @@ jobs: run: cd $(find target/package/ -maxdepth 1 -mindepth 1 -type d) && cargo test --no-default-features - name: Build docs if: matrix.rust == 'nightly' + env: + RUSTDOCFLAGS: -D warnings --cfg docsrs run: cargo doc --all-features --verbose - name: Check formatting if: matrix.rust == 'stable' diff --git a/Cargo.toml b/Cargo.toml index 3545601..bf1a0ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,4 +40,8 @@ features = ["alloc"] [features] default = ["std"] +ks_x_1026-1 = [] std = [] + +[package.metadata.docs.rs] +rustc-args = ["--cfg", "feature=\"ks_x_1026-1\""] diff --git a/README.md b/README.md index 7d10e4d..5e169ed 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,8 @@ fn main() { ## crates.io -You can use this package in your project by adding the following -to your `Cargo.toml`: +You can use this package in your project by adding the following to your +`Cargo.toml`: ```toml [dependencies] @@ -36,4 +36,15 @@ unicode-normalization = "0.1.23" ## `no_std` + `alloc` support -This crate is completely `no_std` + `alloc` compatible. 
This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`. +This crate is completely `no_std` + `alloc` compatible. This can be enabled by +disabling the `std` feature, i.e. specifying `default-features = false` for this +crate on your `Cargo.toml`. + +## KS X 1026-1 + +Korean Standard KS X 1026-1 ([Korean](https://standard.go.kr/KSCI/standardIntro/getStandardSearchView.do?ksNo=KSX1026-1), +[English](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf)) is an ROK government +standard that corrects some defects and makes some changes to the Unicode NFC, +NFKC, and NFKD normalization forms for certain Korean characters. The +`ks_x_1026-1` crate feature (disabled by default) adds methods to support these +alternate normalizations. diff --git a/src/ks_x_1026_1.rs b/src/ks_x_1026_1.rs new file mode 100644 index 0000000..dad327e --- /dev/null +++ b/src/ks_x_1026_1.rs @@ -0,0 +1,233 @@ +//! Annex B + +use core::{ + convert::{TryFrom, TryInto}, + iter::FusedIterator, +}; + +use tinyvec::ArrayVec; + +// § B.1.1 + +use crate::normalize::hangul_constants::{ + L_BASE, L_LAST, N_COUNT, S_BASE, S_COUNT, T_BASE, T_COUNT, T_LAST, V_BASE, V_LAST, +}; + +// § B.1.2 + +fn is_old_jongseong(t: char) -> bool { + match t { + '\u{11C3}'..='\u{11FF}' | '\u{D7CB}'..='\u{D7FB}' => true, + _ => false, + } +} + +/// Iterator that decomposes modern Hangul LV syllables immediately followed by old Hangul T jamo +/// into 3-character L V T sequences, as specified in KS X 1026-1 annex B.1.5. +#[derive(Clone, Debug)] +pub struct RecomposeHangul { + /// Medial vowel of a decomposed LV syllable + v: Option, + /// Character yielded by inner iterator in last call to its `next()` + last: Option, + inner: I, +} + +impl> Iterator for RecomposeHangul { + type Item = char; + + fn next(&mut self) -> Option { + if let Some(v) = self.v { + // If an LV syllable was decomposed in the last call to `next`, + // yield its medial vowel.
+ self.v = None; + Some(v) + } else { + let prev = self.last; + self.last = self.inner.next(); + + if let (Some(prev), Some(next)) = (prev, self.last) { + let s_index = u32::from(prev).wrapping_sub(S_BASE); + if s_index < S_COUNT && s_index % T_COUNT == 0 && is_old_jongseong(next) { + // We have an LV syllable followed by an old jongseong, decompose into L V + let l: char = (L_BASE + s_index / N_COUNT).try_into().unwrap(); + self.v = Some((V_BASE + (s_index % N_COUNT) / T_COUNT).try_into().unwrap()); + return Some(l); + } + } + + prev + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (inner_lo, inner_hi) = self.inner.size_hint(); + let add_factor: usize = self.v.map_or(0, |_| 1) + self.last.map_or(0, |_| 1); + ( + inner_lo.saturating_add(add_factor), + inner_hi + .and_then(|h| h.checked_mul(2)) + .and_then(|h| h.checked_add(add_factor)), + ) + } +} + +impl + FusedIterator> FusedIterator for RecomposeHangul {} + +impl> RecomposeHangul { + #[inline] + pub(crate) fn new(mut iter: I) -> Self { + RecomposeHangul { + v: None, + last: iter.next(), + inner: iter, + } + } +} + +// B.2.1 + +static CP_JAMO: [char; 94] = [ + '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', '\u{1104}', + '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', '\u{111A}', + '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', '\u{110C}', + '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{1161}', '\u{1162}', + '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', + '\u{116B}', '\u{116C}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}', + '\u{1173}', '\u{1174}', '\u{1175}', '\u{1160}', '\u{1114}', '\u{1115}', '\u{11C7}', '\u{11C8}', + '\u{11CC}', '\u{11CE}', '\u{11D3}', '\u{11D7}', '\u{11D9}', '\u{111C}', '\u{11DD}', '\u{11DF}', + '\u{111D}', '\u{111E}', '\u{1120}', '\u{1122}', '\u{1123}', 
'\u{1127}', '\u{1129}', '\u{112B}', + '\u{112C}', '\u{112D}', '\u{112E}', '\u{112F}', '\u{1132}', '\u{1136}', '\u{1140}', '\u{1147}', + '\u{114C}', '\u{11F1}', '\u{11F2}', '\u{1157}', '\u{1158}', '\u{1159}', '\u{1184}', '\u{1185}', + '\u{1188}', '\u{1191}', '\u{1192}', '\u{1194}', '\u{119E}', '\u{11A1}', +]; + +// § B.2.2 + +static HW_JAMO: [char; 64] = [ + '\u{1160}', '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', + '\u{1104}', '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', + '\u{111A}', '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', + '\u{110C}', '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{FFBF}', + '\u{FFC0}', '\u{FFC1}', '\u{1161}', '\u{1162}', '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', + '\u{FFC8}', '\u{FFC9}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', '\u{116B}', '\u{116C}', + '\u{FFD0}', '\u{FFD1}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}', + '\u{FFD8}', '\u{FFD9}', '\u{1173}', '\u{1174}', '\u{1175}', '\u{FFDD}', '\u{FFDE}', '\u{FFDF}', +]; + +// § B.2.3 + +static PC_JAMO: [char; 14] = [ + '\u{1100}', '\u{1102}', '\u{1103}', '\u{1105}', '\u{1106}', '\u{1107}', '\u{1109}', '\u{110B}', + '\u{110C}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', +]; + +// § B.2.4 + +/// Iterator that decomposes compatibility characters containing Hangul jamo +/// in a manner that avoids introducing new nonstandard jamo sequences, +/// as specified in KS X 1026-1 annex B.2.4. +#[derive(Clone, Debug)] +pub struct NormalizeJamoKdkc { + inner: I, + // Buffer for when a character normalizes into multiple. + // Characters are pushed to and popped from the end. + // Length 3 is sufficient, as the longest possible expansion + // is for a parenthesized choseong like U+3200, + // which expands into ['(', choseong, '\u{1160}', ')'] (length 4; the first is yielded directly, so only 3 are buffered). + // (There are no parenthesized jungseong or jongseong.)
+ buf: ArrayVec<[char; 3]>, +} + +impl> Iterator for NormalizeJamoKdkc { + type Item = char; + + fn next(&mut self) -> Option { + if let Some(c) = self.buf.pop() { + // Empty buffer before yielding from underlying iterator. + Some(c) + } else { + let ch = self.inner.next()?; + // Whether ch is a parenthesized Hangul letter + let mut pf = false; + + let uch: u32 = ch.into(); + let base_jamo: char = match uch { + // Hangul compatibility letter + 0x3131..=0x318E => CP_JAMO[usize::try_from(uch - 0x3131).unwrap()], + + // Parenthesized Hangul letter + 0x3200..=0x320D => { + pf = true; + self.buf.push(')'); + PC_JAMO[usize::try_from(uch - 0x3200).unwrap()] + } + + // Circled Hangul letter + 0x3260..=0x326D => PC_JAMO[usize::try_from(uch - 0x3260).unwrap()], + + // Halfwidth Hangul letter + 0xFFA0..=0xFFDF => HW_JAMO[usize::try_from(uch - 0xFFA0).unwrap()], + + _ => return Some(ch), + }; + + // Insert fillers + let first_ret: char = match base_jamo.into() { + // `base_jamo` is choseong, yield a jungseong filler after + L_BASE..=L_LAST => { + self.buf.push('\u{1160}'); + base_jamo + } + + // `base_jamo` is jungseong, yield a choseong filler before + V_BASE..=V_LAST => { + self.buf.push(base_jamo); + '\u{115F}' + } + + // `base_jamo` is jongseong, yield a choseong and a jungseong filler before + T_BASE..=T_LAST => { + self.buf.push(base_jamo); + self.buf.push('\u{1160}'); + '\u{115F}' + } + + _ => unreachable!("`base_jamo` shluld be a jamo, but is not"), + }; + + if pf { + // Parenthesized Hangul letter, yield open paren before + self.buf.push(first_ret); + Some('(') + } else { + Some(first_ret) + } + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (inner_lo, inner_hi) = self.inner.size_hint(); + let add_factor: usize = self.buf.len(); + ( + inner_lo.saturating_add(add_factor), + inner_hi + .and_then(|h| h.checked_mul(4)) // Why 4? 
See comment on `buf` field + .and_then(|h| h.checked_add(add_factor)), + ) + } +} + +impl + FusedIterator> FusedIterator for NormalizeJamoKdkc {} + +impl> NormalizeJamoKdkc { + #[inline] + pub(crate) fn new(iter: I) -> Self { + NormalizeJamoKdkc { + inner: iter, + buf: ArrayVec::new(), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index cc0a850..32ef120 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,23 +36,36 @@ //! [dependencies] //! unicode-normalization = "0.1.20" //! ``` +//! +//! # KS X 1026-1 +//! +//! Korean Standard KS X 1026-1 ([Korean](https://standard.go.kr/KSCI/standardIntro/getStandardSearchView.do?ksNo=KSX1026-1), +//! [English](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf)) is an ROK government +//! standard that corrects some defects and makes some changes to the Unicode NFC, +//! NFKC, and NFKD normalization forms for certain Korean characters. The +//! `ks_x_1026-1` crate feature (disabled by default) adds methods to support these +//! alternate normalizations. #![deny(missing_docs, unsafe_code)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" )] +#![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr(not(feature = "std"), no_std)] #[cfg(not(feature = "std"))] extern crate alloc; -#[cfg(feature = "std")] -extern crate core; - extern crate tinyvec; pub use crate::decompose::Decompositions; +#[cfg(feature = "ks_x_1026-1")] +#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] +pub use crate::ks_x_1026_1::NormalizeJamoKdkc; +#[cfg(feature = "ks_x_1026-1")] +#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] +pub use crate::ks_x_1026_1::RecomposeHangul; pub use crate::quick_check::{ is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick, is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick, @@ -65,6 +78,8 @@ pub use crate::tables::UNICODE_VERSION; use core::{option, str::Chars}; 
mod decompose; +#[cfg(feature = "ks_x_1026-1")] +mod ks_x_1026_1; mod lookups; mod normalize; mod perfect_hash; @@ -97,19 +112,19 @@ pub mod char { /// as described in /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). pub trait UnicodeNormalization> { - /// Returns an iterator over the string in Unicode Normalization Form D + /// An iterator over the string in Unicode Normalization Form D /// (canonical decomposition). fn nfd(self) -> Decompositions; - /// Returns an iterator over the string in Unicode Normalization Form KD + /// An iterator over the string in Unicode Normalization Form KD /// (compatibility decomposition). fn nfkd(self) -> Decompositions; - /// An Iterator over the string in Unicode Normalization Form C + /// An iterator over the string in Unicode Normalization Form C /// (canonical decomposition followed by canonical composition). fn nfc(self) -> Recompositions; - /// An Iterator over the string in Unicode Normalization Form KC + /// An iterator over the string in Unicode Normalization Form KC /// (compatibility decomposition followed by canonical composition). fn nfkc(self) -> Recompositions; @@ -125,9 +140,41 @@ pub trait UnicodeNormalization> { /// implementations the option to recognize them. fn cjk_compat_variants(self) -> Replacements; - /// An Iterator over the string with Conjoining Grapheme Joiner characters - /// inserted according to the Stream-Safe Text Process (UAX15-D4) + /// An iterator over the string with Conjoining Grapheme Joiner characters + /// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4)) fn stream_safe(self) -> StreamSafe; + + /// An iterator over the string in the variant of Unicode Normalization Form KD + /// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode + /// in that it will not produce nonstandard Korean jamo sequences if none were present in the input. 
+ /// (Any string that is in KS X 1026-1 modified NFKD is also in standard Unicode NFKD, + /// but the reverse may not hold.) + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + + fn nfkd_ks_x_1026_1(self) -> Decompositions>; + + /// An iterator over the string in the variant of Unicode Normalization Form C + /// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode + /// in that it will not contain any precomposed LV Hangul syllables immediately followed by conjoining T jamo. + /// (A string that is in KS X 1026-1 modified NFC might not be in standard Unicode NFC, + /// and vice versa.) + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + + fn nfc_ks_x_1026_1(self) -> RecomposeHangul>; + + /// An iterator over the string in the variant of Unicode Normalization Form KC + /// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode + /// in that it will not produce nonstandard Korean jamo sequences if none were present in the input, + /// and it will also not contain any precomposed LV Hangul syllables immediately followed + /// by conjoining T jamo. + /// (A string that is in KS X 1026-1 modified NFKC might not be in standard Unicode NFKC, + /// and vice versa.) 
+ #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + + fn nfkc_ks_x_1026_1(self) -> RecomposeHangul>>; } impl<'a> UnicodeNormalization> for &'a str { @@ -160,6 +207,29 @@ impl<'a> UnicodeNormalization> for &'a str { fn stream_safe(self) -> StreamSafe> { StreamSafe::new(self.chars()) } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkd_ks_x_1026_1(self) -> Decompositions>> { + decompose::new_compatible(NormalizeJamoKdkc::new(self.chars())) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfc_ks_x_1026_1(self) -> RecomposeHangul>> { + RecomposeHangul::new(self.nfc()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkc_ks_x_1026_1(self) -> RecomposeHangul>>> { + RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new( + self.chars(), + ))) + } } impl UnicodeNormalization> for char { @@ -192,6 +262,31 @@ impl UnicodeNormalization> for char { fn stream_safe(self) -> StreamSafe> { StreamSafe::new(Some(self).into_iter()) } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkd_ks_x_1026_1(self) -> Decompositions>> { + decompose::new_compatible(NormalizeJamoKdkc::new(Some(self).into_iter())) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfc_ks_x_1026_1(self) -> RecomposeHangul>> { + RecomposeHangul::new(self.nfc()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkc_ks_x_1026_1( + self, + ) -> RecomposeHangul>>> { + RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new( + Some(self).into_iter(), + ))) + } } impl> UnicodeNormalization for I { @@ -224,4 +319,25 @@ impl> UnicodeNormalization for I { fn stream_safe(self) -> 
StreamSafe { StreamSafe::new(self) } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkd_ks_x_1026_1(self) -> Decompositions> { + decompose::new_compatible(NormalizeJamoKdkc::new(self)) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfc_ks_x_1026_1(self) -> RecomposeHangul> { + RecomposeHangul::new(self.nfc()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkc_ks_x_1026_1(self) -> RecomposeHangul>> { + RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new(self))) + } } diff --git a/src/normalize.rs b/src/normalize.rs index e878642..b742927 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -106,22 +106,27 @@ pub fn compose(a: char, b: char) -> Option { compose_hangul(a, b).or_else(|| composition_table(a, b)) } -// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior -// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior -const S_BASE: u32 = 0xAC00; -const L_BASE: u32 = 0x1100; -const V_BASE: u32 = 0x1161; -const T_BASE: u32 = 0x11A7; -const L_COUNT: u32 = 19; -const V_COUNT: u32 = 21; -const T_COUNT: u32 = 28; -const N_COUNT: u32 = V_COUNT * T_COUNT; -const S_COUNT: u32 = L_COUNT * N_COUNT; - -const S_LAST: u32 = S_BASE + S_COUNT - 1; -const L_LAST: u32 = L_BASE + L_COUNT - 1; -const V_LAST: u32 = V_BASE + V_COUNT - 1; -const T_LAST: u32 = T_BASE + T_COUNT - 1; +/// Constants from Unicode 15.0.0 Section 3.12 Conjoining Jamo Behavior +/// +/// (also found in KS X 1026-1 annex B.1.1 ). 
+pub mod hangul_constants { + pub const S_BASE: u32 = 0xAC00; + pub const L_BASE: u32 = 0x1100; + pub const V_BASE: u32 = 0x1161; + pub const T_BASE: u32 = 0x11A7; + pub const L_COUNT: u32 = 19; + pub const V_COUNT: u32 = 21; + pub const T_COUNT: u32 = 28; + pub const N_COUNT: u32 = V_COUNT * T_COUNT; + pub const S_COUNT: u32 = L_COUNT * N_COUNT; + + pub const S_LAST: u32 = S_BASE + S_COUNT - 1; + pub const L_LAST: u32 = L_BASE + L_COUNT - 1; + pub const V_LAST: u32 = V_BASE + V_COUNT - 1; + pub const T_LAST: u32 = T_BASE + T_COUNT - 1; +} + +use hangul_constants::*; // Composition only occurs for `TPart`s in `U+11A8 ..= U+11C2`, // i.e. `T_BASE + 1 ..= T_LAST`. diff --git a/tests/ks_x_1026_1.rs b/tests/ks_x_1026_1.rs new file mode 100644 index 0000000..55dc81c --- /dev/null +++ b/tests/ks_x_1026_1.rs @@ -0,0 +1,103 @@ +#![cfg(feature = "ks_x_1026-1")] + +use unicode_normalization::UnicodeNormalization; + +macro_rules! norm_string { + ($method: ident, $input: expr) => { + $input.$method().collect::() + }; +} + +/// § 6.2 +#[test] +fn compatibility_and_halfwidth_hangul_letters() { + // Compatibility + let orig = "\u{3131}\u{314F}"; + assert_eq!(norm_string!(nfkd, orig), "\u{1100}\u{1161}"); + assert_eq!(norm_string!(nfkc, orig), "\u{AC00}"); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1161}" + ); + assert_eq!( + norm_string!(nfkc_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1161}" + ); + + // Halfwidth + let orig = "\u{FFA1}\u{FFC6}"; + assert_eq!(norm_string!(nfd, orig), "\u{FFA1}\u{FFC6}"); + assert_eq!(norm_string!(nfc, orig), "\u{FFA1}\u{FFC6}"); + assert_eq!(norm_string!(nfkd, orig), "\u{1100}\u{1165}"); + assert_eq!(norm_string!(nfkc, orig), "\u{AC70}"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "\u{FFA1}\u{FFC6}"); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1165}" + ); + assert_eq!( + norm_string!(nfkc_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1165}" + ); +} 
+ +/// § 6.3 +#[test] +fn hangul_embedded_symbols() { + // Circled + let orig = "\u{3260}"; + assert_eq!(norm_string!(nfd, orig), "\u{3260}"); + assert_eq!(norm_string!(nfc, orig), "\u{3260}"); + assert_eq!(norm_string!(nfkd, orig), "\u{1100}"); + assert_eq!(norm_string!(nfkc, orig), "\u{1100}"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "\u{3260}"); + assert_eq!(norm_string!(nfkd_ks_x_1026_1, orig), "\u{1100}\u{1160}"); + assert_eq!(norm_string!(nfkc_ks_x_1026_1, orig), "\u{1100}\u{1160}"); + + // Parenthesized + let orig = "\u{3200}"; + assert_eq!(norm_string!(nfd, orig), "\u{3200}"); + assert_eq!(norm_string!(nfc, orig), "\u{3200}"); + assert_eq!(norm_string!(nfkd, orig), "(\u{1100})"); + assert_eq!(norm_string!(nfkc, orig), "(\u{1100})"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "\u{3200}"); + assert_eq!(norm_string!(nfkd_ks_x_1026_1, orig), "(\u{1100}\u{1160})"); + assert_eq!(norm_string!(nfkc_ks_x_1026_1, orig), "(\u{1100}\u{1160})"); +} + +/// § 6.4 +#[test] +fn hangul_syllable_blocks() { + let orig = "\u{1100}\u{1161}\u{11EB}"; + assert_eq!(norm_string!(nfd, orig), "\u{1100}\u{1161}\u{11EB}"); + assert_eq!(norm_string!(nfc, orig), "\u{AC00}\u{11EB}"); + assert_eq!(norm_string!(nfkd, orig), "\u{1100}\u{1161}\u{11EB}"); + assert_eq!(norm_string!(nfkc, orig), "\u{AC00}\u{11EB}"); + assert_eq!( + norm_string!(nfc_ks_x_1026_1, orig), + "\u{1100}\u{1161}\u{11EB}" + ); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "\u{1100}\u{1161}\u{11EB}" + ); + assert_eq!( + norm_string!(nfkc_ks_x_1026_1, orig), + "\u{1100}\u{1161}\u{11EB}" + ); +} + +#[test] +fn non_hangul() { + let orig = "ab\u{010D}de\u{0301}"; + assert_eq!(norm_string!(nfd, orig), "abc\u{030C}de\u{0301}"); + assert_eq!(norm_string!(nfc, orig), "ab\u{010D}d\u{00E9}"); + assert_eq!(norm_string!(nfkd, orig), "abc\u{030C}de\u{0301}"); + assert_eq!(norm_string!(nfkc, orig), "ab\u{010D}d\u{00E9}"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "ab\u{010D}d\u{00E9}"); + assert_eq!( + 
norm_string!(nfkd_ks_x_1026_1, orig), + "abc\u{030C}de\u{0301}" + ); + assert_eq!(norm_string!(nfkc_ks_x_1026_1, orig), "ab\u{010D}d\u{00E9}"); +}