Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support emoji presentation sequences #41

Merged
merged 1 commit into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Regen
run: cd scripts && python3 unicode.py
- name: Diff
Expand Down
8 changes: 8 additions & 0 deletions benches/benches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,11 @@ fn jawiki(b: &mut Bencher) {
let string = std::fs::read_to_string(data_path).unwrap_or_default();
b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
}

/// Benchmarks `UnicodeWidthStr::width` over the contents of `bench_data/emoji-style.txt`.
#[bench]
fn emoji(b: &mut Bencher) {
    // The input is not checked in: download emoji-style.txt from
    // https://www.unicode.org/emoji/charts/emoji-style.txt into `bench_data/` before running.
    // If the file cannot be read, the benchmark falls back to measuring the empty string.
    let contents = std::fs::read_to_string("bench_data/emoji-style.txt").unwrap_or_default();
    let text: &str = contents.as_str();
    b.iter(|| test::black_box(UnicodeWidthStr::width(text)));
}
148 changes: 141 additions & 7 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# - HangulSyllableType.txt
# - PropList.txt
# - ReadMe.txt
# - emoji/emoji-variation-sequences.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated module into git.
Expand All @@ -26,6 +27,8 @@
import os
import re
import sys
from collections import defaultdict
from itertools import batched

NUM_CODEPOINTS = 0x110000
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
Expand Down Expand Up @@ -69,12 +72,13 @@ def fetch_open(filename: str):
"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
"""
if not os.path.exists(os.path.basename(filename)):
basename = os.path.basename(filename)
if not os.path.exists(basename):
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
try:
return open(filename, encoding="utf-8")
return open(basename, encoding="utf-8")
except OSError:
sys.stderr.write(f"cannot load {filename}")
sys.stderr.write(f"cannot load {basename}")
sys.exit(1)


Expand Down Expand Up @@ -384,8 +388,71 @@ def make_tables(
return tables


def load_variation_sequences() -> "list[int]":
    """Returns the codepoints of all characters that can start an emoji presentation sequence.

    Reads `emoji/emoji-variation-sequences.txt` (via `fetch_open`) and collects, in file order,
    the first codepoint of every line that pairs a single codepoint with U+FE0F and is
    labeled "emoji style". The result is a flat list of individual codepoints, not ranges.
    """

    # Match all emoji presentation sequences
    # (one codepoint followed by U+FE0F, and labeled "emoji style")
    sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Iterate the file lazily; comment lines, blank lines and "text style"
        # entries simply fail to match and are skipped.
        return [
            int(match.group(1), 16)
            for line in sequences
            if (match := sequence.match(line))
        ]


def make_variation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
) -> "tuple[list[int], list[list[int]]]":
    """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
    (Characters that are always wide may be excluded.)
    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.

    `seqs` holds the codepoints that can start an emoji presentation sequence;
    `width_map` is indexed by codepoint and gives each one's effective width.

    Returns `(indexes, leaves)`, two parallel lists: `indexes[i]` is the `cp >> 10` value
    whose bitmap is `leaves[i]`. Each leaf is a list of 128 byte values in which bit `n % 8`
    of byte `n // 8` is set when the codepoint whose 10 LSB equal `n` is included.
    """

    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # We don't strictly need to keep track of characters that are always wide,
    # because being in an emoji variation seq won't affect their width.
    # So store their info only when it wouldn't inflate the size of the tables:
    # a leaf that would hold nothing but always-wide characters is dropped entirely.
    # (Collect first, delete afterwards, so the dict is not mutated while iterating.)
    all_wide_prefixes = [
        prefix
        for prefix, low_bits in prefixes_dict.items()
        if all(
            width_map[(prefix << 10) | low] == EffectiveWidth.WIDE for low in low_bits
        )
    ]
    for prefix in all_wide_prefixes:
        del prefixes_dict[prefix]

    # Similarly, we can spuriously return `true` for always-wide characters
    # even if not part of a presentation seq; this saves an additional lookup,
    # so we should do it where there is no size cost.
    # Membership is tested against the dict itself (hash lookup); `in` on a
    # `defaultdict` does not insert missing keys, so no new leaves are created here.
    for cp, width in enumerate(width_map):
        if width == EffectiveWidth.WIDE and (cp >> 10) in prefixes_dict:
            prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # Keys and values are read from the same dict, so `indexes` and `leaves` line up.
    indexes = list(prefixes_dict)
    leaves = []
    for low_bits in prefixes_dict.values():
        leaf = [0] * 128
        for low in low_bits:
            byte_idx, bit_idx = divmod(low, 8)
            leaf[byte_idx] |= 1 << bit_idx
        leaves.append(leaf)
    return (indexes, leaves)


def emit_module(
out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
out_name: str,
unicode_version: "tuple[int, int, int]",
tables: "list[Table]",
variation_table: "tuple[list[int], list[list[int]]]",
):
"""Outputs a Rust module to `out_name` using table data from `tables`.
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
Expand Down Expand Up @@ -462,6 +529,40 @@ def emit_module(
"""
)

variation_idx, variation_leaves = variation_table

module.write(
"""
/// Whether this character forms an [emoji presentation sequence]
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// when followed by `'\\u{FE0F}'`.
/// Emoji presentation sequences are considered to have width 2.
/// This may spuriously return `true` or `false` for characters that are always wide.
#[inline]
pub fn starts_emoji_presentation_seq(c: char) -> bool {
let cp: u32 = c.into();
// First level of lookup uses all but 10 LSB
let top_bits = cp >> 10;
let idx_of_leaf: usize = match top_bits {
"""
)

for i, msbs in enumerate(variation_idx):
module.write(f" {msbs} => {i},\n")

module.write(
""" _ => return false,
};
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
// and use them to index into `leaf_row`.
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
// Use the 3 LSB of `cp` to index into `leaf_byte`.
((leaf_byte >> (cp & 7)) & 1) == 1
}
"""
)

module.write(
"""
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
Expand Down Expand Up @@ -510,6 +611,29 @@ def emit_module(
module.write(f" 0x{byte:02X},")
module.write("\n ];\n")
subtable_count = new_subtable_count

# emoji table

module.write(
f"""
#[repr(align(128))]
struct Align128<T>(T);
/// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
/// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
"""
)
for leaf in variation_leaves:
module.write(" [\n")
for row in batched(leaf, 14):
module.write(" ")
for entry in row:
module.write(f" 0x{entry:02X},")
module.write("\n")
module.write(" ],\n")

module.write(" ]);\n")

module.write("}\n")


Expand All @@ -520,6 +644,7 @@ def main(module_filename: str):

We obey the following rules, in decreasing order of importance:

- Emoji presentation sequences are double-width.
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
- Hangul jamo medial vowels & final consonants are zero-width.
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
Expand Down Expand Up @@ -549,16 +674,25 @@ def main(module_filename: str):

tables = make_tables(TABLE_CFGS, enumerate(width_map))

emoji_variations = load_variation_sequences()
variation_table = make_variation_sequence_table(emoji_variations, width_map)

print("------------------------")
total_size = 0
for i, table in enumerate(tables):
size_bytes = len(table.to_bytes())
print(f"Table {i} Size: {size_bytes} bytes")
print(f"Table {i} size: {size_bytes} bytes")
total_size += size_bytes
emoji_index_size = len(variation_table[0]) * 4
print(f"Emoji presentation index size: {emoji_index_size} bytes")
total_size += emoji_index_size
emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
total_size += emoji_leaves_size
print("------------------------")
print(f" Total Size: {total_size} bytes")
print(f" Total size: {total_size} bytes")

emit_module(module_filename, version, tables)
emit_module(module_filename, version, tables, variation_table)
print(f'Wrote to "{module_filename}"')


Expand Down
45 changes: 34 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@
//! This crate currently uses the following rules to determine the width of a
//! character or string, in order of decreasing precedence. These may be tweaked in the future.
//!
//! 1. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 3. The following have width 0:
//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.)
//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 4. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All @@ -53,15 +55,15 @@
//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593)
//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
//! 4. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
//! have no defined width, and are ignored when determining the width of a string.
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
//! with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
//! have width 2 in an East Asian context, and width 1 otherwise.
//! 7. All other characters have width 1.
//! 8. All other characters have width 1.
//!
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
Expand Down Expand Up @@ -122,7 +124,9 @@ impl UnicodeWidthChar for char {
pub trait UnicodeWidthStr {
/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width.
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All @@ -132,7 +136,9 @@ pub trait UnicodeWidthStr {

/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width.
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All @@ -144,11 +150,28 @@ pub trait UnicodeWidthStr {
impl UnicodeWidthStr for str {
#[inline]
fn width(&self) -> usize {
self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum()
str_width(self, false)
}

#[inline]
fn width_cjk(&self) -> usize {
self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum()
str_width(self, true)
}
}

/// Computes the displayed width of `s` in columns.
///
/// Characters are visited from last to first so that, on reaching a character, it is
/// already known whether the character immediately after it is `'\u{FE0F}'`.
/// A character for which `cw::starts_emoji_presentation_seq` returns `true` and which is
/// directly followed by `'\u{FE0F}'` counts as 2 columns; every other character
/// contributes `cw::width(c, is_cjk)`, with `None` counted as 0.
/// `'\u{FE0F}'` itself adds nothing to the total.
fn str_width(s: &str, is_cjk: bool) -> usize {
    let mut total = 0;
    // True while the character just visited (i.e. the next one in string order) was U+FE0F.
    let mut next_is_fe0f = false;

    for ch in s.chars().rev() {
        if ch == '\u{FE0F}' {
            next_is_fe0f = true;
            continue;
        }

        total += if next_is_fe0f && cw::starts_emoji_presentation_seq(ch) {
            2
        } else {
            cw::width(ch, is_cjk).unwrap_or(0)
        };
        next_is_fe0f = false;
    }

    total
}
Loading
Loading