-
-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
util: move UTF-8 decoding to its own module
This moves UTF-8 decoding from an internal helper in `jiff::util::escape` to a crate-wide helper in `jiff::util::utf8`. This makes it easier for other code to use this. I may end up using it for #111, for example.
- Loading branch information
1 parent
ce9c60e
commit b329427
Showing
3 changed files
with
52 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,3 +9,4 @@ pub(crate) mod parse; | |
pub(crate) mod rangeint; | ||
pub(crate) mod round; | ||
pub(crate) mod t; | ||
pub(crate) mod utf8; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
/// Decodes the next UTF-8 encoded codepoint from the given byte slice. | ||
/// | ||
/// If no valid encoding of a codepoint exists at the beginning of the given | ||
/// byte slice, then the first byte is returned instead. | ||
/// | ||
/// This returns `None` if and only if `bytes` is empty. | ||
/// | ||
/// This never panics. | ||
/// | ||
/// *WARNING*: This is not designed for performance. If you're looking for a | ||
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this | ||
/// crate, then please file an issue and discuss your use case. | ||
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { | ||
if bytes.is_empty() { | ||
return None; | ||
} | ||
let len = match utf8_len(bytes[0]) { | ||
None => return Some(Err(bytes[0])), | ||
Some(len) if len > bytes.len() => return Some(Err(bytes[0])), | ||
Some(1) => return Some(Ok(char::from(bytes[0]))), | ||
Some(len) => len, | ||
}; | ||
match core::str::from_utf8(&bytes[..len]) { | ||
Ok(s) => Some(Ok(s.chars().next().unwrap())), | ||
Err(_) => Some(Err(bytes[0])), | ||
} | ||
} | ||
|
||
/// Returns the total length, in bytes, of the UTF-8 sequence introduced by
/// the given leading byte.
///
/// `None` is returned when `byte` cannot start a sequence: either it is a
/// continuation byte (`0b10xx_xxxx`) or it is `0b1111_1000` or greater.
///
/// Only the high bits of `byte` are inspected here. A `Some` result does not
/// by itself mean that a valid encoding follows.
fn utf8_len(byte: u8) -> Option<usize> {
    match byte {
        // 0xxx_xxxx: ASCII, a sequence of one.
        0x00..=0x7F => Some(1),
        // 10xx_xxxx: continuation byte, never a leading byte.
        0x80..=0xBF => None,
        // 110x_xxxx
        0xC0..=0xDF => Some(2),
        // 1110_xxxx
        0xE0..=0xEF => Some(3),
        // 1111_0xxx
        0xF0..=0xF7 => Some(4),
        // 1111_1xxx: not a leading byte.
        0xF8..=0xFF => None,
    }
}