From aedab94d8fd4e2d4f287b6719c7e43aaec93e018 Mon Sep 17 00:00:00 2001 From: liyixin <601947961@qq.com> Date: Tue, 31 Oct 2023 15:10:55 +0800 Subject: [PATCH 1/3] use simd-utf8 to speed up utf8 validation --- Cargo.toml | 25 ++++++++++++++++++++----- src/de/error.rs | 9 +++++---- src/de/mod.rs | 6 +++--- src/de/raw.rs | 26 +++++--------------------- src/raw/error.rs | 2 +- src/raw/mod.rs | 9 +++------ 6 files changed, 37 insertions(+), 40 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index acdca236..82b46b69 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ exclude = [ "rustfmt.toml", ".travis.yml", ".evergreen/**", - ".gitignore" + ".gitignore", ] [features] @@ -54,20 +54,31 @@ name = "bson" [dependencies] ahash = "0.8.0" -chrono = { version = "0.4.15", features = ["std"], default-features = false, optional = true } +chrono = { version = "0.4.15", features = [ + "std", +], default-features = false, optional = true } rand = "0.8" serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0", features = ["preserve_order"] } +simdutf8 = "0.1.4" indexmap = "1.6.2" hex = "0.4.2" base64 = "0.13.0" once_cell = "1.5.1" -uuid-0_8 = { package = "uuid", version = "0.8.1", features = ["serde", "v4"], optional = true } +uuid-0_8 = { package = "uuid", version = "0.8.1", features = [ + "serde", + "v4", +], optional = true } uuid = { version = "1.1.2", features = ["serde", "v4"] } serde_bytes = "0.11.5" serde_with = { version = "1.3.1", optional = true } serde_with-3 = { package = "serde_with", version = "3.1.0", optional = true } -time = { version = "0.3.9", features = ["formatting", "parsing", "macros", "large-dates"] } +time = { version = "0.3.9", features = [ + "formatting", + "parsing", + "macros", + "large-dates", +] } bitvec = "1.0.1" [target.'cfg(target_arch = "wasm32")'.dependencies] @@ -78,7 +89,11 @@ criterion = "0.3.0" pretty_assertions = "0.6.1" proptest = "1.0.0" serde_bytes = "0.11" -chrono = { version = "0.4", features = ["serde", 
"clock", "std"], default-features = false } +chrono = { version = "0.4", features = [ + "serde", + "clock", + "std", +], default-features = false } [package.metadata.docs.rs] all-features = true diff --git a/src/de/error.rs b/src/de/error.rs index 30d4f51a..41d17643 100644 --- a/src/de/error.rs +++ b/src/de/error.rs @@ -1,6 +1,7 @@ -use std::{error, fmt, fmt::Display, io, string, sync::Arc}; +use std::{error, fmt, fmt::Display, io, sync::Arc}; use serde::de::{self, Unexpected}; +use simdutf8::basic::Utf8Error; use crate::Bson; @@ -13,7 +14,7 @@ pub enum Error { /// A [`std::string::FromUtf8Error`](https://doc.rust-lang.org/std/string/struct.FromUtf8Error.html) encountered /// while decoding a UTF-8 String from the input data. - InvalidUtf8String(string::FromUtf8Error), + InvalidUtf8String(Utf8Error), /// While decoding a [`Document`](crate::Document) from bytes, an unexpected or unsupported /// element type was encountered. @@ -44,8 +45,8 @@ impl From for Error { } } -impl From for Error { - fn from(err: string::FromUtf8Error) -> Error { +impl From for Error { + fn from(err: Utf8Error) -> Error { Error::InvalidUtf8String(err) } } diff --git a/src/de/mod.rs b/src/de/mod.rs index 909b71cd..d53ac220 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -38,8 +38,7 @@ use crate::{ raw::RawBinaryRef, ser::write_i32, spec::{self, BinarySubtype}, - Binary, - Decimal128, + Binary, Decimal128, }; use ::serde::{ @@ -152,7 +151,8 @@ fn read_cstring(reader: &mut R) -> Result { v.push(c); } - Ok(String::from_utf8(v)?) 
+ let _ = simdutf8::basic::from_utf8(&v)?; + unsafe { Ok(String::from_utf8_unchecked(v)) } } #[inline] diff --git a/src/de/raw.rs b/src/de/raw.rs index 874d025e..5564e7cb 100644 --- a/src/de/raw.rs +++ b/src/de/raw.rs @@ -7,8 +7,7 @@ use std::{ use serde::{ de::{EnumAccess, Error as SerdeError, IntoDeserializer, MapAccess, VariantAccess}, - forward_to_deserialize_any, - Deserializer as SerdeDeserializer, + forward_to_deserialize_any, Deserializer as SerdeDeserializer, }; use crate::{ @@ -16,27 +15,12 @@ use crate::{ raw::{RawBinaryRef, RAW_ARRAY_NEWTYPE, RAW_BSON_NEWTYPE, RAW_DOCUMENT_NEWTYPE}, spec::{BinarySubtype, ElementType}, uuid::UUID_NEWTYPE_NAME, - Bson, - DateTime, - Decimal128, - DeserializerOptions, - RawDocument, - Timestamp, + Bson, DateTime, Decimal128, DeserializerOptions, RawDocument, Timestamp, }; use super::{ - read_bool, - read_f128, - read_f64, - read_i32, - read_i64, - read_string, - read_u8, - DeserializerHint, - Error, - Result, - MAX_BSON_SIZE, - MIN_CODE_WITH_SCOPE_SIZE, + read_bool, read_f128, read_f64, read_i32, read_i64, read_string, read_u8, DeserializerHint, + Error, Result, MAX_BSON_SIZE, MIN_CODE_WITH_SCOPE_SIZE, }; use crate::de::serde::MapDeserializer; @@ -1759,7 +1743,7 @@ impl<'a> BsonBuf<'a> { let s = if utf8_lossy_override.unwrap_or(self.utf8_lossy) { String::from_utf8_lossy(bytes) } else { - Cow::Borrowed(std::str::from_utf8(bytes).map_err(Error::custom)?) + Cow::Borrowed(simdutf8::basic::from_utf8(bytes).map_err(Error::custom)?) 
}; // consume the null byte diff --git a/src/raw/error.rs b/src/raw/error.rs index 556b7fa0..02207ac0 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -1,4 +1,4 @@ -use std::str::Utf8Error; +use simdutf8::basic::Utf8Error; use crate::spec::ElementType; diff --git a/src/raw/mod.rs b/src/raw/mod.rs index a96f6d13..f1de8c67 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -133,11 +133,7 @@ pub use self::{ array_buf::RawArrayBuf, bson::{RawBson, RawJavaScriptCodeWithScope}, bson_ref::{ - RawBinaryRef, - RawBsonRef, - RawDbPointerRef, - RawJavaScriptCodeWithScopeRef, - RawRegexRef, + RawBinaryRef, RawBsonRef, RawDbPointerRef, RawJavaScriptCodeWithScopeRef, RawRegexRef, }, document::RawDocument, document_buf::RawDocumentBuf, @@ -255,7 +251,8 @@ fn read_lenencoded(buf: &[u8]) -> Result<&str> { } fn try_to_str(data: &[u8]) -> Result<&str> { - std::str::from_utf8(data).map_err(|e| Error::new_without_key(ErrorKind::Utf8EncodingError(e))) + simdutf8::basic::from_utf8(data) + .map_err(|e| Error::new_without_key(ErrorKind::Utf8EncodingError(e))) } fn usize_try_from_i32(i: i32) -> Result { From 17781ef18d5c17f469cc3d2e4735e5862ef95bdb Mon Sep 17 00:00:00 2001 From: liyixin <601947961@qq.com> Date: Wed, 8 Nov 2023 15:54:16 +0800 Subject: [PATCH 2/3] add safety comment --- src/de/mod.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index d53ac220..60cd187e 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -107,14 +107,12 @@ pub(crate) fn read_string(reader: &mut R, utf8_lossy: bool) -> )); } + let mut buf = Vec::with_capacity(len as usize - 1); + reader.take(len as u64 - 1).read_to_end(&mut buf)?; let s = if utf8_lossy { - let mut buf = Vec::with_capacity(len as usize - 1); - reader.take(len as u64 - 1).read_to_end(&mut buf)?; String::from_utf8_lossy(&buf).to_string() } else { - let mut s = String::with_capacity(len as usize - 1); - reader.take(len as u64 - 1).read_to_string(&mut s)?; - s + to_string(buf)? 
}; // read the null terminator @@ -151,7 +149,12 @@ fn read_cstring(reader: &mut R) -> Result { v.push(c); } + to_string(v) +} + +fn to_string(v: Vec) -> Result { let _ = simdutf8::basic::from_utf8(&v)?; + // SAFETY: `simdutf8::basic::from_utf8` returned `Ok` above, so `v` is valid UTF-8. unsafe { Ok(String::from_utf8_unchecked(v)) } } From 3aee962f3bcac30eaa3995f7f4e89519d8f87d95 Mon Sep 17 00:00:00 2001 From: liyixin <601947961@qq.com> Date: Wed, 15 Nov 2023 10:39:21 +0800 Subject: [PATCH 3/3] rustfmt --- src/de/mod.rs | 5 +++-- src/de/raw.rs | 24 ++++++++++++++++++++---- src/raw/mod.rs | 6 +++++- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index 60cd187e..900eb9d3 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -38,10 +38,11 @@ use crate::{ raw::RawBinaryRef, ser::write_i32, spec::{self, BinarySubtype}, - Binary, Decimal128, + Binary, + Decimal128, }; -use ::serde::{ +use serde::{ de::{DeserializeOwned, Error as _, Unexpected}, Deserialize, }; diff --git a/src/de/raw.rs b/src/de/raw.rs index 5564e7cb..d71e1bde 100644 --- a/src/de/raw.rs +++ b/src/de/raw.rs @@ -7,7 +7,8 @@ use std::{ use serde::{ de::{EnumAccess, Error as SerdeError, IntoDeserializer, MapAccess, VariantAccess}, - forward_to_deserialize_any, Deserializer as SerdeDeserializer, + forward_to_deserialize_any, + Deserializer as SerdeDeserializer, }; use crate::{ @@ -15,12 +16,27 @@ use crate::{ raw::{RawBinaryRef, RAW_ARRAY_NEWTYPE, RAW_BSON_NEWTYPE, RAW_DOCUMENT_NEWTYPE}, spec::{BinarySubtype, ElementType}, uuid::UUID_NEWTYPE_NAME, - Bson, DateTime, Decimal128, DeserializerOptions, RawDocument, Timestamp, + Bson, + DateTime, + Decimal128, + DeserializerOptions, + RawDocument, + Timestamp, }; use super::{ - read_bool, read_f128, read_f64, read_i32, read_i64, read_string, read_u8, DeserializerHint, - Error, Result, MAX_BSON_SIZE, MIN_CODE_WITH_SCOPE_SIZE, + read_bool, + read_f128, + read_f64, + read_i32, + read_i64, + read_string, + read_u8, + DeserializerHint, + Error, + Result, + MAX_BSON_SIZE, 
+ MIN_CODE_WITH_SCOPE_SIZE, }; use crate::de::serde::MapDeserializer; diff --git a/src/raw/mod.rs b/src/raw/mod.rs index f1de8c67..02da2d3e 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -133,7 +133,11 @@ pub use self::{ array_buf::RawArrayBuf, bson::{RawBson, RawJavaScriptCodeWithScope}, bson_ref::{ - RawBinaryRef, RawBsonRef, RawDbPointerRef, RawJavaScriptCodeWithScopeRef, RawRegexRef, + RawBinaryRef, + RawBsonRef, + RawDbPointerRef, + RawJavaScriptCodeWithScopeRef, + RawRegexRef, }, document::RawDocument, document_buf::RawDocumentBuf,