Skip to content

Commit

Permalink
util: move UTF-8 decoding to its own module
Browse files Browse the repository at this point in the history
This moves UTF-8 decoding from an internal helper in
`jiff::util::escape` to a crate-wide helper in `jiff::util::utf8`.

This makes it easier for other code to use this. I may end up using it
for #111, for example.
  • Loading branch information
BurntSushi committed Sep 8, 2024
1 parent ce9c60e commit b329427
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 50 deletions.
53 changes: 3 additions & 50 deletions src/util/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ Provides convenience routines for escaping raw bytes.
This was copied from `regex-automata` with a few light edits.
*/

use crate::util::utf8;

/// Provides a convenient `Debug` implementation for a `u8`.
///
/// The `Debug` impl treats the byte as an ASCII, and emits a human readable
Expand Down Expand Up @@ -52,7 +54,7 @@ impl<'a> core::fmt::Display for Bytes<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// This is a sad re-implementation of a similar impl found in bstr.
let mut bytes = self.0;
while let Some(result) = utf8_decode(bytes) {
while let Some(result) = utf8::decode(bytes) {
let ch = match result {
Ok(ch) => ch,
Err(byte) => {
Expand Down Expand Up @@ -89,52 +91,3 @@ impl<'a> core::fmt::Debug for Bytes<'a> {
Ok(())
}
}

/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for a
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let len = match utf8_len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(1) => return Some(Ok(char::from(bytes[0]))),
Some(len) => len,
};
match core::str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
}

/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
fn utf8_len(byte: u8) -> Option<usize> {
if byte <= 0x7F {
return Some(1);
} else if byte & 0b1100_0000 == 0b1000_0000 {
return None;
} else if byte <= 0b1101_1111 {
Some(2)
} else if byte <= 0b1110_1111 {
Some(3)
} else if byte <= 0b1111_0111 {
Some(4)
} else {
None
}
}
1 change: 1 addition & 0 deletions src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ pub(crate) mod parse;
pub(crate) mod rangeint;
pub(crate) mod round;
pub(crate) mod t;
pub(crate) mod utf8;
48 changes: 48 additions & 0 deletions src/util/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for a
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let len = match utf8_len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(1) => return Some(Ok(char::from(bytes[0]))),
Some(len) => len,
};
match core::str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
}

/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
fn utf8_len(byte: u8) -> Option<usize> {
if byte <= 0x7F {
return Some(1);
} else if byte & 0b1100_0000 == 0b1000_0000 {
return None;
} else if byte <= 0b1101_1111 {
Some(2)
} else if byte <= 0b1110_1111 {
Some(3)
} else if byte <= 0b1111_0111 {
Some(4)
} else {
None
}
}

0 comments on commit b329427

Please sign in to comment.