Skip to content

Commit

Permalink
Remove simdutf8 dependency (#8)
Browse files (browse the repository at this point in the history)
aumetra authored Dec 8, 2023
1 parent f744fd0 commit 69522e2
Showing 5 changed files with 142 additions and 111 deletions.
183 changes: 118 additions & 65 deletions Cargo.lock
9 changes: 2 additions & 7 deletions Cargo.toml
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,7 @@ harness = false
ahash = "0.8.6"
bytecount = "0.6.7"
lol_html = "1.2.0"
once_cell = "1.18.0"
simdutf8 = { version = "0.1.4", optional = true }
once_cell = "1.19.0"
slab = "0.4.9"
thiserror = "1.0.50"

@@ -33,11 +32,7 @@ default = ["simd"]
# Enables the `lol_html` `debug_trace` feature. Do not use in production!
debug_trace = ["lol_html/debug_trace"]
# Enables SIMD acceleration for some operations we have to perform
simd = [
"bytecount/runtime-dispatch-simd",
"dep:simdutf8",
"simdutf8/aarch64_neon",
]
simd = ["bytecount/runtime-dispatch-simd"]

[dev-dependencies]
ammonia = "3.3.0"
51 changes: 17 additions & 34 deletions src/lib.rs
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,7 @@ use lol_html::{
};
use once_cell::sync::Lazy;
use slab::Slab;
use std::{
borrow::Cow, cell::RefCell, fmt::Write, iter, rc::Rc, str::FromStr, string::FromUtf8Error,
};
use std::{borrow::Cow, cell::RefCell, fmt::Write, iter, rc::Rc, str::FromStr};
use thiserror::Error;

#[doc(hidden)]
@@ -41,10 +39,7 @@ static SELECT_ALL: Lazy<Selector> = Lazy::new(|| Selector::from_str("*").unwrap(
///
/// See [`BubbleBath::clean`] documentation
#[inline]
pub fn clean<C>(content: C) -> Result<String, Error>
where
C: AsRef<[u8]>,
{
pub fn clean(content: &str) -> Result<String, Error> {
GLOBAL_BUBBLE_BATH.clean(content)
}

@@ -81,15 +76,6 @@ pub enum Error {
/// The rewriting of the HTML content failed
#[error(transparent)]
Rewriting(#[from] RewritingError),

/// The bytes were not valid UTF8
#[error(transparent)]
Utf8(#[from] FromUtf8Error),

/// The bytes were not valid UTF8 (SIMD-accelerated check)
#[cfg(feature = "simd")]
#[error(transparent)]
SimdUtf8(#[from] simdutf8::basic::Utf8Error),
}

/// HTML sanitizer
@@ -385,25 +371,22 @@ impl BubbleBath<'_> {
///
/// Check [`Self::clean_streaming`] for additional errors
#[inline]
pub fn clean<C>(&self, content: C) -> Result<String, Error>
where
C: AsRef<[u8]>,
{
let content = content.as_ref();
pub fn clean(&self, content: &str) -> Result<String, Error> {
let mut acc = Vec::with_capacity(content.len());
self.clean_streaming(iter::once(content), |out| acc.extend_from_slice(out))?;

#[cfg(feature = "simd")]
{
simdutf8::basic::from_utf8(&acc)?;

// SAFETY: The invariant of the data being valid UTF-8 has been checked in the line above
#[allow(unsafe_code)]
return Ok(unsafe { String::from_utf8_unchecked(acc) });
}

#[cfg(not(feature = "simd"))]
Ok(String::from_utf8(acc)?)
self.clean_streaming(iter::once(content.as_bytes()), |out| {
acc.extend_from_slice(out);
})?;

// SAFETY: The input is a string slice, so it is guaranteed to be valid UTF-8,
// and the entire output has been buffered into the accumulator.
//
// According to [this comment](https://github.com/cloudflare/lol-html/issues/200#issuecomment-1829731640),
// `lol_html` always outputs the data in the same encoding it was supplied in.
//
// Therefore, because the complete output is accumulated and the input is valid UTF-8,
// this byte vector is valid UTF-8 as well.
#[allow(unsafe_code)]
Ok(unsafe { String::from_utf8_unchecked(acc) })
}
}

6 changes: 3 additions & 3 deletions tests/ammonia_tests.rs
Original file line number | Diff line number | Diff line change
@@ -4,17 +4,17 @@ use bubble_bath::*;

#[test]
fn deeply_nested_allowlisted() {
clean("<b>".repeat(60_000)).unwrap();
clean(&"<b>".repeat(60_000)).unwrap();
}

#[test]
fn deeply_nested_denylisted() {
clean("<b-b>".repeat(60_000)).unwrap();
clean(&"<b-b>".repeat(60_000)).unwrap();
}

#[test]
fn deeply_nested_alternating() {
clean("<b-b>".repeat(35_000)).unwrap();
clean(&"<b-b>".repeat(35_000)).unwrap();
}

#[test]
4 changes: 2 additions & 2 deletions tests/torture.rs
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ use std::fs;
fn torture() {
insta::glob!("inputs/*", |path| {
let input = fs::read_to_string(path).unwrap();
assert_snapshot!(bubble_bath::clean(input).unwrap());
assert_snapshot!(bubble_bath::clean(&input).unwrap());
});
}

@@ -19,6 +19,6 @@ fn torture_escaped() {
..BubbleBath::default()
};

assert_snapshot!(bubble_bath.clean(input).unwrap());
assert_snapshot!(bubble_bath.clean(&input).unwrap());
});
}

0 comments on commit 69522e2

Please sign in to comment.