unicode-rs · Manishearth · Apr 23, 2024 · Apr 23, 2024
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -32,6 +32,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
     - name: Regen
       run: cd scripts && python3 unicode.py
     - name: Diff

diff --git a/benches/benches.rs b/benches/benches.rs
@@ -104,3 +104,11 @@ fn jawiki(b: &mut Bencher) {
     let string = std::fs::read_to_string(data_path).unwrap_or_default();
     b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
 }
+
+#[bench]
+fn emoji(b: &mut Bencher) {
+    // To benchmark, download https://www.unicode.org/emoji/charts/emoji-style.txt to `bench_data/emoji-style.txt`; if the file is missing, an empty string is measured.
+    let data_path = "bench_data/emoji-style.txt";
+    let string = std::fs::read_to_string(data_path).unwrap_or_default();
+    b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
+}
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -17,6 +17,7 @@
 # - HangulSyllableType.txt
 # - PropList.txt
 # - ReadMe.txt
+# - emoji/emoji-variation-sequences.txt
 #
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the generated module into git.
@@ -26,6 +27,8 @@
 import os
 import re
 import sys
+from collections import defaultdict
+from itertools import batched
 
 NUM_CODEPOINTS = 0x110000
 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
@@ -69,12 +72,13 @@ def fetch_open(filename: str):
     """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
     fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
     """
-    if not os.path.exists(os.path.basename(filename)):
+    basename = os.path.basename(filename)
+    if not os.path.exists(basename):
         os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
     try:
-        return open(filename, encoding="utf-8")
+        return open(basename, encoding="utf-8")
     except OSError:
-        sys.stderr.write(f"cannot load {filename}")
+        sys.stderr.write(f"cannot load {basename}")
         sys.exit(1)
 
 
@@ -384,8 +388,71 @@ def make_tables(
     return tables
 
 
+def load_variation_sequences() -> "list[int]":
+    """Returns a list of codepoints, one for each character that can validly start
+    an emoji presentation sequence."""
+
+    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
+        # Match all emoji presentation sequences
+        # (one codepoint followed by U+FE0F, and labeled "emoji style")
+        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
+        codepoints = []
+        for line in sequences.readlines():
+            if match := sequence.match(line):
+                cp = int(match.group(1), 16)
+                codepoints.append(cp)
+    return codepoints
+
+
+def make_variation_sequence_table(
+    seqs: "list[int]",
+    width_map: "list[EffectiveWidth]",
+) -> "tuple[list[int], list[list[int]]]":
+    """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
+    (Characters that are always wide may be excluded.)
+    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
+    """
+
+    prefixes_dict = defaultdict(set)
+    for cp in seqs:
+        prefixes_dict[cp >> 10].add(cp & 0x3FF)
+
+    # We don't strictly need to keep track of characters that are always wide,
+    # because being in an emoji variation seq won't affect their width.
+    # So store their info only when it wouldn't inflate the size of the tables.
+    for k in list(prefixes_dict.keys()):
+        if all(
+            map(
+                lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE,
+                prefixes_dict[k],
+            )
+        ):
+            del prefixes_dict[k]
+
+    indexes = list(prefixes_dict.keys())
+
+    # Similarly, we can spuriously return `true` for always-wide characters
+    # even if not part of a presentation seq; this saves an additional lookup,
+    # so we should do it where there is no size cost.
+    for cp, width in enumerate(width_map):
+        if width == EffectiveWidth.WIDE and (cp >> 10) in indexes:
+            prefixes_dict[cp >> 10].add(cp & 0x3FF)
+
+    leaves = []
+    for cps in prefixes_dict.values():
+        leaf = [0] * 128
+        for cp in cps:
+            idx_in_leaf, bit_shift = divmod(cp, 8)
+            leaf[idx_in_leaf] |= 1 << bit_shift
+        leaves.append(leaf)
+    return (indexes, leaves)
+
+
 def emit_module(
-    out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
+    out_name: str,
+    unicode_version: "tuple[int, int, int]",
+    tables: "list[Table]",
+    variation_table: "tuple[list[int], list[list[int]]]",
 ):
     """Outputs a Rust module to `out_name` using table data from `tables`.
     If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -462,6 +529,40 @@ def emit_module(
 """
         )
 
+        variation_idx, variation_leaves = variation_table
+
+        module.write(
+            """
+    /// Whether this character forms an
+    /// [emoji presentation sequence](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
+    /// when followed by `'\\u{FE0F}'`.
+    /// Emoji presentation sequences are considered to have width 2.
+    /// This may spuriously return `true` or `false` for characters that are always wide.
+    #[inline]
+    pub fn starts_emoji_presentation_seq(c: char) -> bool {
+        let cp: u32 = c.into();
+        // First level of lookup uses all but 10 LSB
+        let top_bits = cp >> 10;
+        let idx_of_leaf: usize = match top_bits {
+"""
+        )
+
+        for i, msbs in enumerate(variation_idx):
+            module.write(f"            {msbs} => {i},\n")
+
+        module.write(
+            """            _ => return false,
+        };
+        // Extract bits 3 through 9 (0-indexed from the least significant bit) of `cp`,
+        // and use them to select a byte within the leaf chosen by `idx_of_leaf`.
+        let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
+        let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
+        // Use the 3 LSB of `cp` to index into `leaf_byte`.
+        ((leaf_byte >> (cp & 7)) & 1) == 1
+    }
+"""
+        )
+
         module.write(
             """
     /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -510,6 +611,29 @@ def emit_module(
                 module.write(f" 0x{byte:02X},")
             module.write("\n    ];\n")
             subtable_count = new_subtable_count
+
+        # emoji table
+
+        module.write(
+            f"""
+    #[repr(align(128))]
+    struct Align128<T>(T);
+    /// Array of 1024-bit bitmaps. Index into the correct bitmap (chosen by the first-level `match`
+    /// in `starts_emoji_presentation_seq`) with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
+    static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
+"""
+        )
+        for leaf in variation_leaves:
+            module.write("        [\n")
+            for row in batched(leaf, 14):
+                module.write("           ")
+                for entry in row:
+                    module.write(f" 0x{entry:02X},")
+                module.write("\n")
+            module.write("        ],\n")
+
+        module.write("    ]);\n")
+
         module.write("}\n")
 
 
@@ -520,6 +644,7 @@ def main(module_filename: str):
 
     We obey the following rules, in decreasing order of importance:
 
+    - Emoji presentation sequences are double-width.
     - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
     - Hangul jamo medial vowels & final consonants are zero-width.
     - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
@@ -549,16 +674,25 @@ def main(module_filename: str):
 
     tables = make_tables(TABLE_CFGS, enumerate(width_map))
 
+    emoji_variations = load_variation_sequences()
+    variation_table = make_variation_sequence_table(emoji_variations, width_map)
+
     print("------------------------")
     total_size = 0
     for i, table in enumerate(tables):
         size_bytes = len(table.to_bytes())
-        print(f"Table {i} Size: {size_bytes} bytes")
+        print(f"Table {i} size: {size_bytes} bytes")
         total_size += size_bytes
+    emoji_index_size = len(variation_table[0]) * 4
+    print(f"Emoji presentation index size: {emoji_index_size} bytes")
+    total_size += emoji_index_size
+    emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
+    print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
+    total_size += emoji_leaves_size
     print("------------------------")
-    print(f"  Total Size: {total_size} bytes")
+    print(f"  Total size: {total_size} bytes")
 
-    emit_module(module_filename, version, tables)
+    emit_module(module_filename, version, tables, variation_table)
     print(f'Wrote to "{module_filename}"')
 
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -33,9 +33,11 @@
 //! This crate currently uses the following rules to determine the width of a
 //! character or string, in order of decreasing precedence. These may be tweaked in the future.
 //!
-//! 1. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
-//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
-//! 3. The following have width 0:
+//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
+//!    have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.)
+//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
+//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
+//! 4. The following have width 0:
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
 //!       with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -53,15 +55,15 @@
 //!       with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593)
 //!       of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
 //!    - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
-//! 4. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
+//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
 //!    have no defined width, and are ignored when determining the width of a string.
-//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
+//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //!    with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
 //!    or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
-//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
+//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
 //!    with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
 //!    have width 2 in an East Asian context, and width 1 otherwise.
-//! 7. All other characters have width 1.
+//! 8. All other characters have width 1.
 //!
 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
 //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
@@ -122,7 +124,9 @@ impl UnicodeWidthChar for char {
 pub trait UnicodeWidthStr {
     /// Returns the string's displayed width in columns.
     ///
-    /// Control characters are treated as having zero width.
+    /// Control characters are treated as having zero width,
+    /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
+    /// are assigned width 2.
     ///
     /// This function treats characters in the Ambiguous category according
     /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -132,7 +136,9 @@ pub trait UnicodeWidthStr {
 
     /// Returns the string's displayed width in columns.
     ///
-    /// Control characters are treated as having zero width.
+    /// Control characters are treated as having zero width,
+    /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
+    /// are assigned width 2.
     ///
     /// This function treats characters in the Ambiguous category according
     /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -144,11 +150,28 @@ pub trait UnicodeWidthStr {
 impl UnicodeWidthStr for str {
     #[inline]
     fn width(&self) -> usize {
-        self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum()
+        str_width(self, false)
     }
 
     #[inline]
     fn width_cjk(&self) -> usize {
-        self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum()
+        str_width(self, true)
     }
 }
+
+fn str_width(s: &str, is_cjk: bool) -> usize {
+    s.chars()
+        .rfold((0, false), |(sum, was_fe0f), c| {
+            if c == '\u{FE0F}' {
+                (sum, true)
+            } else {
+                let add = if was_fe0f && cw::starts_emoji_presentation_seq(c) {
+                    2
+                } else {
+                    cw::width(c, is_cjk).unwrap_or(0)
+                };
+                (sum + add, false)
+            }
+        })
+        .0
+}