diff --git a/src/util/escape.rs b/src/util/escape.rs index a2ebf94..4528f4b 100644 --- a/src/util/escape.rs +++ b/src/util/escape.rs @@ -4,6 +4,8 @@ Provides convenience routines for escaping raw bytes. This was copied from `regex-automata` with a few light edits. */ +use crate::util::utf8; + /// Provides a convenient `Debug` implementation for a `u8`. /// /// The `Debug` impl treats the byte as an ASCII, and emits a human readable @@ -52,7 +54,7 @@ impl<'a> core::fmt::Display for Bytes<'a> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { // This is a sad re-implementation of a similar impl found in bstr. let mut bytes = self.0; - while let Some(result) = utf8_decode(bytes) { + while let Some(result) = utf8::decode(bytes) { let ch = match result { Ok(ch) => ch, Err(byte) => { @@ -89,52 +91,3 @@ impl<'a> core::fmt::Debug for Bytes<'a> { Ok(()) } } - -/// Decodes the next UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the beginning of the given -/// byte slice, then the first byte is returned instead. -/// -/// This returns `None` if and only if `bytes` is empty. -/// -/// This never panics. -/// -/// *WARNING*: This is not designed for performance. If you're looking for a -/// fast UTF-8 decoder, this is not it. If you feel like you need one in this -/// crate, then please file an issue and discuss your use case. -fn utf8_decode(bytes: &[u8]) -> Option> { - if bytes.is_empty() { - return None; - } - let len = match utf8_len(bytes[0]) { - None => return Some(Err(bytes[0])), - Some(len) if len > bytes.len() => return Some(Err(bytes[0])), - Some(1) => return Some(Ok(char::from(bytes[0]))), - Some(len) => len, - }; - match core::str::from_utf8(&bytes[..len]) { - Ok(s) => Some(Ok(s.chars().next().unwrap())), - Err(_) => Some(Err(bytes[0])), - } -} - -/// Given a UTF-8 leading byte, this returns the total number of code units -/// in the following encoded codepoint. 
-/// -/// If the given byte is not a valid UTF-8 leading byte, then this returns -/// `None`. -fn utf8_len(byte: u8) -> Option { - if byte <= 0x7F { - return Some(1); - } else if byte & 0b1100_0000 == 0b1000_0000 { - return None; - } else if byte <= 0b1101_1111 { - Some(2) - } else if byte <= 0b1110_1111 { - Some(3) - } else if byte <= 0b1111_0111 { - Some(4) - } else { - None - } -} diff --git a/src/util/mod.rs b/src/util/mod.rs index 1ebb616..d54371b 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -9,3 +9,4 @@ pub(crate) mod parse; pub(crate) mod rangeint; pub(crate) mod round; pub(crate) mod t; +pub(crate) mod utf8; diff --git a/src/util/utf8.rs b/src/util/utf8.rs new file mode 100644 index 0000000..25cb333 --- /dev/null +++ b/src/util/utf8.rs @@ -0,0 +1,48 @@ +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +/// +/// This never panics. +/// +/// *WARNING*: This is not designed for performance. If you're looking for a +/// fast UTF-8 decoder, this is not it. If you feel like you need one in this +/// crate, then please file an issue and discuss your use case. +pub(crate) fn decode(bytes: &[u8]) -> Option> { + if bytes.is_empty() { + return None; + } + let len = match utf8_len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. 
/// Classifies `byte` as a UTF-8 leading byte and returns the total number of
/// code units (bytes) in the encoded codepoint it would introduce.
///
/// Returns `None` when `byte` cannot start a sequence: either it is a
/// continuation byte (`0b10xx_xxxx`) or its high five bits are all set
/// (`0xF8..=0xFF`).
///
/// The classification looks only at the bit pattern of the leading byte. It
/// therefore reports `Some(2)` for `0xC0`/`0xC1` and `Some(4)` for
/// `0xF5..=0xF7`, none of which can begin well-formed UTF-8. Callers must
/// still validate the complete sequence (for example with
/// `core::str::from_utf8`) before trusting the returned length.
fn utf8_len(byte: u8) -> Option<usize> {
    match byte {
        // 0xxx_xxxx: ASCII, a single code unit.
        0x00..=0x7F => Some(1),
        // 10xx_xxxx: continuation byte, never a leading byte.
        0x80..=0xBF => None,
        // 110x_xxxx: introduces a two byte sequence.
        0xC0..=0xDF => Some(2),
        // 1110_xxxx: introduces a three byte sequence.
        0xE0..=0xEF => Some(3),
        // 1111_0xxx: introduces a four byte sequence.
        0xF0..=0xF7 => Some(4),
        // 1111_1xxx: not used by UTF-8 at all.
        0xF8..=0xFF => None,
    }
}