util: move UTF-8 decoding to its own module

This moves UTF-8 decoding from an internal helper in `jiff::util::escape` to a crate-wide helper in `jiff::util::utf8`, which makes the decoder easier for other code in the crate to reuse. I may end up using it for #111, for example.
BurntSushi · Sep 8, 2024 · b329427 · b329427
1 parent ce9c60e
commit b329427
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 50 deletions.
diff --git a/src/util/escape.rs b/src/util/escape.rs
@@ -4,6 +4,8 @@ Provides convenience routines for escaping raw bytes.
 This was copied from `regex-automata` with a few light edits.
 */
 
+use crate::util::utf8;
+
 /// Provides a convenient `Debug` implementation for a `u8`.
 ///
 /// The `Debug` impl treats the byte as an ASCII, and emits a human readable
@@ -52,7 +54,7 @@ impl<'a> core::fmt::Display for Bytes<'a> {
     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
         // This is a sad re-implementation of a similar impl found in bstr.
         let mut bytes = self.0;
-        while let Some(result) = utf8_decode(bytes) {
+        while let Some(result) = utf8::decode(bytes) {
             let ch = match result {
                 Ok(ch) => ch,
                 Err(byte) => {
@@ -89,52 +91,3 @@ impl<'a> core::fmt::Debug for Bytes<'a> {
         Ok(())
     }
 }
-
-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the given
-/// byte slice, then the first byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-///
-/// This never panics.
-///
-/// *WARNING*: This is not designed for performance. If you're looking for a
-/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
-/// crate, then please file an issue and discuss your use case.
-fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
-    if bytes.is_empty() {
-        return None;
-    }
-    let len = match utf8_len(bytes[0]) {
-        None => return Some(Err(bytes[0])),
-        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
-        Some(1) => return Some(Ok(char::from(bytes[0]))),
-        Some(len) => len,
-    };
-    match core::str::from_utf8(&bytes[..len]) {
-        Ok(s) => Some(Ok(s.chars().next().unwrap())),
-        Err(_) => Some(Err(bytes[0])),
-    }
-}
-
-/// Given a UTF-8 leading byte, this returns the total number of code units
-/// in the following encoded codepoint.
-///
-/// If the given byte is not a valid UTF-8 leading byte, then this returns
-/// `None`.
-fn utf8_len(byte: u8) -> Option<usize> {
-    if byte <= 0x7F {
-        return Some(1);
-    } else if byte & 0b1100_0000 == 0b1000_0000 {
-        return None;
-    } else if byte <= 0b1101_1111 {
-        Some(2)
-    } else if byte <= 0b1110_1111 {
-        Some(3)
-    } else if byte <= 0b1111_0111 {
-        Some(4)
-    } else {
-        None
-    }
-}
diff --git a/src/util/mod.rs b/src/util/mod.rs
@@ -9,3 +9,4 @@ pub(crate) mod parse;
 pub(crate) mod rangeint;
 pub(crate) mod round;
 pub(crate) mod t;
+pub(crate) mod utf8;
diff --git a/src/util/utf8.rs b/src/util/utf8.rs
@@ -0,0 +1,48 @@
+/// Decodes the next UTF-8 encoded codepoint from the front of `bytes`.
+///
+/// Returns `Some(Ok(ch))` when `bytes` starts with a valid encoding of `ch`.
+/// Otherwise (including when the encoding is cut short by the end of
+/// `bytes`), returns `Some(Err(b))`, where `b` is the first byte of `bytes`.
+/// Returns `None` if and only if `bytes` is empty.
+///
+/// This never panics.
+///
+/// *WARNING*: This is not designed for performance. If you're looking for a
+/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
+/// crate, then please file an issue and discuss your use case.
+pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
+    if bytes.is_empty() {
+        return None;
+    }
+    let len = match utf8_len(bytes[0]) {
+        None => return Some(Err(bytes[0])), // not a leading byte
+        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
+        Some(1) => return Some(Ok(char::from(bytes[0]))), // ASCII
+        Some(len) => len,
+    };
+    match core::str::from_utf8(&bytes[..len]) { // validates the sequence
+        Ok(s) => Some(Ok(s.chars().next().unwrap())), // `s` is non-empty
+        Err(_) => Some(Err(bytes[0])),
+    }
+}
+
+/// Given a UTF-8 leading byte, this returns the total number of code units
+/// (1 to 4) in the encoded codepoint that it begins.
+///
+/// Returns `None` for continuation bytes and `0xF8..=0xFF`. The check is
+/// loose: the invalid `0xC0`, `0xC1` and `0xF5..=0xF7` still get a length.
+fn utf8_len(byte: u8) -> Option<usize> {
+    if byte <= 0x7F {
+        return Some(1); // ASCII
+    } else if byte & 0b1100_0000 == 0b1000_0000 {
+        return None; // continuation byte (`10xxxxxx`)
+    } else if byte <= 0b1101_1111 {
+        Some(2) // `110xxxxx`
+    } else if byte <= 0b1110_1111 {
+        Some(3) // `1110xxxx`
+    } else if byte <= 0b1111_0111 {
+        Some(4) // `11110xxx`
+    } else {
+        None // `11111xxx` never starts a sequence
+    }
+}