diff --git a/duvet/Cargo.toml b/duvet/Cargo.toml index c47d2b8..725b145 100644 --- a/duvet/Cargo.toml +++ b/duvet/Cargo.toml @@ -32,6 +32,7 @@ url = "2" v_jsonescape = "0.7" [dev-dependencies] +bolero = "0.12" insta = { version = "1", features = ["filters", "json"] } serde_json = "1" strip-ansi-escapes = "0.2" diff --git a/duvet/src/annotation.rs b/duvet/src/annotation.rs index ca64cae..a6d40dc 100644 --- a/duvet/src/annotation.rs +++ b/duvet/src/annotation.rs @@ -182,7 +182,7 @@ impl Annotation { Err(error!("Could not resolve file {:?}", file)) } - pub fn quote_range(&self, contents: &str) -> Option> { + pub fn quote_range(&self, contents: &str) -> Option<(Range, crate::text::find::Kind)> { crate::text::find(&self.quote, contents) } } diff --git a/duvet/src/reference.rs b/duvet/src/reference.rs index 829c9f6..80b8860 100644 --- a/duvet/src/reference.rs +++ b/duvet/src/reference.rs @@ -113,7 +113,12 @@ pub async fn build_references( continue; } - if let Some(range) = annotation.quote_range(&contents) { + if let Some((range, kind)) = annotation.quote_range(&contents) { + if kind.is_fuzzy() { + // TODO + //warnings.push(warn!("")); + } + for text in contents.ranges(range) { references.push(Reference { target: target.clone(), diff --git a/duvet/src/report.rs b/duvet/src/report.rs index 1e96610..0543a09 100644 --- a/duvet/src/report.rs +++ b/duvet/src/report.rs @@ -68,18 +68,21 @@ impl Report { let specifications = annotation::specifications(annotations.clone(), spec_path).await?; progress!(progress, "Loaded {} specifications", specifications.len()); - let progress = progress!("Compiling references"); + let progress = progress!("Mapping sections"); let reference_map = annotation::reference_map(annotations.clone()).await?; + progress!(progress, "Mapped {} sections", reference_map.len()); + let progress = progress!("Matching references"); let mut report = ReportResult { targets: Default::default(), annotations, blob_link: self.blob_link.as_deref(), issue_link: self.issue_link.as_deref(), }; - let references = reference::query(reference_map.clone(), specifications.clone()).await?; + progress!(progress, "Matched {} references", references.len()); + let progress = progress!("Sorting references"); for reference in references.iter() { report .targets @@ -103,12 +106,7 @@ impl Report { target.statuses.populate(&target.references) }); - progress!( - progress, - "Compiled {} references across {} sections", - references.len(), - reference_map.len() - ); + progress!(progress, "Sorted {} references", references.len()); type ReportFn = fn(&ReportResult, &Path) -> crate::Result<()>; diff --git a/duvet/src/text.rs b/duvet/src/text.rs index cbfd5d2..d3ccc74 100644 --- a/duvet/src/text.rs +++ b/duvet/src/text.rs @@ -1,7 +1,7 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod find; +pub mod find; pub mod view; pub use find::find; diff --git a/duvet/src/text/find.rs b/duvet/src/text/find.rs index e3df816..1df4160 100644 --- a/duvet/src/text/find.rs +++ b/duvet/src/text/find.rs @@ -1,25 +1,45 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use core::ops::Range; -use triple_accel::levenshtein::levenshtein_search as text_search; +use core::{fmt, ops::Range}; +use triple_accel::levenshtein::levenshtein_search_simd_with_opts as text_search; -/// In order to make text matching a little nicer to work with, we split on any punctuation, -/// rather than require strict matching -static PUNCTUATION: &[char] = &[ - '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', - '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', -]; +#[derive(Clone, Copy, Debug)] +pub enum Kind { + Exact, + Fuzzy, +} -pub fn find(needle: &str, haystack: &str) -> Option> { +impl Kind { + #[inline] + pub fn is_fuzzy(&self) -> bool { + matches!(self, Self::Fuzzy) + } +} + +pub fn find(needle: &str, haystack: &str) -> Option<(Range, Kind)> { if needle.is_empty() { return None; } + macro_rules! try_find { + ($find:expr, $kind:expr) => { + if let Some(range) = $find { + return Some((range, $kind)); + } + }; + } + // try finding without ignoring whitespace first - fast_find(needle, haystack) - .or_else(|| fuzzy_find(needle, haystack)) - .or_else(|| slow_find(needle, haystack)) + try_find!(fast_find(needle, haystack), Kind::Exact); + + let normalized_search = NormalizedSearch::new(needle, haystack); + + try_find!(normalized_search.find(fast_find), Kind::Exact); + + try_find!(normalized_search.find(fuzzy_find), Kind::Fuzzy); + + None } fn fast_find(needle: &str, haystack: &str) -> Option> { @@ -32,58 +52,214 @@ fn fast_find(needle: &str, haystack: &str) -> Option> { /// TODO we should probaly deprecate this - it's better to enforce strict matching fn fuzzy_find(needle: &str, haystack: &str) -> Option> { - text_search(needle.as_bytes(), haystack.as_bytes()) - .filter(|m| m.k < 2) - .min_by_key(|m| (m.k, m.start)) - .map(|m| m.start..m.end) + text_search( + needle.as_bytes(), + haystack.as_bytes(), + 1, + triple_accel::SearchType::Best, + triple_accel::levenshtein::LEVENSHTEIN_COSTS, + false, + ) + .map(|m| m.start..m.end) + .next() +} + +struct NormalizedSearch<'a> { + needle: String, + haystack: String, + original_haystack: &'a str, + offset_map: Vec, +} + +impl fmt::Debug for NormalizedSearch<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + struct Mapping<'a> { + formatted: &'a str, + original: &'a str, + mapping: &'a [usize], + } + + impl fmt::Debug for Mapping<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut m = f.debug_map(); + for (idx, ch) in self.formatted.char_indices() { + let start = self.mapping[idx]; + let end = self.mapping[idx + 1]; + let c = &self.original[start..end]; + m.entry(&ch, &(c, start..end)); + } + m.finish() + } + } + + f.debug_struct("NormalizedSearch") + .field("needle", &self.needle) + .field("haystack", &self.haystack) + .field( + "mapping", + &Mapping { + formatted: &self.haystack, + original: self.original_haystack, + mapping: &self.offset_map, + }, + ) + .finish() + } } -fn slow_find(needle: &str, haystack: &str) -> Option> { - let (needle, _) = normalize_whitespace(needle); - let (haystack, offset_map) = normalize_whitespace(haystack); - let range = fuzzy_find(&needle, &haystack)?; +impl<'a> NormalizedSearch<'a> { + fn new(needle: &str, original_haystack: &'a str) -> Self { + let (needle, ()) = normalize_whitespace(needle); + let (haystack, offset_map) = normalize_whitespace(original_haystack); + Self { + needle, + haystack, + original_haystack, + offset_map, + } + } + + fn find(&self, find: fn(&str, &str) -> Option>) -> Option> { + let range = find(&self.needle, &self.haystack)?; + let start = self.offset_map[range.start]; + let end = self.offset_map[range.end]; + + // trim any whitespace at the end + let original = &self.original_haystack[start..end]; + let end = start + original.trim_end().len(); + + Some(start..end) + } +} + +trait OffsetMap { + fn with_capacity(len: usize) -> Self; + fn push(&mut self, idx: usize); +} + +impl OffsetMap for () { + #[inline] + fn with_capacity(_len: usize) -> Self {} + + #[inline] + fn push(&mut self, _idx: usize) {} +} - let start = offset_map[range.start]; - let end = offset_map[range.end]; +impl OffsetMap for Vec { + #[inline] + fn with_capacity(len: usize) -> Self { + Vec::with_capacity(len + 1) + } - Some(start..end) + #[inline] + fn push(&mut self, idx: usize) { + self.push(idx); + } } -fn normalize_whitespace(value: &str) -> (String, Vec) { - let mut offset_map = Vec::with_capacity(value.len() + 1); - let mut out = String::with_capacity(value.len()); +fn normalize_whitespace(value: &str) -> (String, O) { + struct Mapper { + out: String, + offset_map: O, + buffer: Option, + last_end: usize, + } - let value_start = value.as_ptr() as usize; - let mut trimmed_end = 0; + impl Mapper { + #[inline] + fn on_char(&mut self, idx: usize, c: char) { + if c.is_alphanumeric() { + self.flush(); + self.push(idx, c); + return; + } - for word in value.split_whitespace() { - for word in word.split_inclusive(PUNCTUATION) { - let start = word.as_ptr() as usize - value_start; - let end = start + word.len(); - trimmed_end = end; + if c.is_whitespace() { + if self.buffer.is_none() && !self.out.is_empty() { + self.buffer = Some(Buffer { + start: idx, + is_ws: true, + c, + }); + } + return; + } + + // punctuation + if let Some(buffer) = self.buffer.as_ref() { + if !buffer.is_ws { + self.flush(); + } + } + + self.buffer = Some(Buffer { + start: idx, + is_ws: false, + c, + }); + } - if !out.is_empty() { - out.push(' '); - offset_map.push(start); + #[inline] + fn flush(&mut self) { + if let Some(buffer) = self.buffer.take() { + self.push(buffer.start, buffer.c); } - out.push_str(word); - offset_map.extend(start..end); } + + #[inline] + fn push(&mut self, idx: usize, c: char) { + self.out.push(c); + let len = c.len_utf8(); + for _ in 0..len { + self.offset_map.push(idx); + } + self.last_end = idx + len; + } + + #[inline] + fn finish(mut self) -> (String, O) { + if let Some(buffer) = self.buffer.take() { + if !buffer.is_ws { + self.push(buffer.start, buffer.c); + } + } + self.offset_map.push(self.last_end); + (self.out, self.offset_map) + } + } + + struct Buffer { + start: usize, + is_ws: bool, + c: char, } - offset_map.push(trimmed_end); + let offset_map = O::with_capacity(value.len()); + let out = String::with_capacity(value.len()); - debug_assert_eq!(out.len() + 1, offset_map.len()); + let mut mapper = Mapper { + offset_map, + out, + buffer: None, + last_end: 0, + }; + + for (idx, c) in value.char_indices() { + mapper.on_char(idx, c); + } + + let (out, offset_map) = mapper.finish(); (out, offset_map) } #[cfg(test)] mod tests { + use super::Kind; use core::ops::Range; - fn find<'a>(needle: &str, haystack: &'a str) -> Option<(Range, &'a str)> { - super::find(needle, haystack).map(|r| (r.clone(), &haystack[r])) + fn find<'a>(needle: &str, haystack: &'a str) -> Option<(Range, Kind, &'a str)> { + super::find(needle, haystack).map(|(r, kind)| (r.clone(), kind, &haystack[r])) } macro_rules! find_test { @@ -117,4 +293,50 @@ mod tests { "this is a new-\nline", "this is a new-line" ); + find_test!( + punctuation_test, + " Second Sentence. ", + " First sentence. Second Sentence. Third Sentence. " + ); + + fn normalize_whitespace(value: &str) -> (String, Vec) { + let (normalized, mapping) = super::normalize_whitespace::>(value); + + dbg!(value, &normalized); + let mut prev: Option = None; + + for (idx, ch) in normalized.char_indices() { + if let Some(prev) = prev { + if prev.is_whitespace() || !prev.is_alphanumeric() { + assert!(!ch.is_whitespace()); + } + } + prev = Some(ch); + + let start = mapping[idx]; + let end = mapping[idx + ch.len_utf8()]; + let c = &value[start..end]; + assert!(!c.is_empty(), "{mapping:?}"); + } + + (normalized, mapping) + } + + #[test] + fn normalize_test() { + bolero::check!().with_type::().for_each(|s| { + let _ = normalize_whitespace(s); + }); + } + + #[test] + fn foo_test() { + let (a, _) = normalize_whitespace("This is a test.Foo.[F]"); + let (b, _) = normalize_whitespace(" This is a test. Foo . [F]"); + let (c, _) = normalize_whitespace(" This is a test. Foo . [F] "); + let (d, _) = normalize_whitespace("This is a test. Foo . [ F ] "); + assert_eq!(a, b); + assert_eq!(a, c); + assert_eq!(a, d); + } } diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__end.snap b/duvet/src/text/snapshots/duvet__text__find__tests__end.snap index f22d3ff..01276d9 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__end.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__end.snap @@ -5,6 +5,7 @@ expression: "find(\"d\", \"a b c d\")" Some( ( 6..7, + Exact, "d", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__end_2.snap b/duvet/src/text/snapshots/duvet__text__find__tests__end_2.snap index 858a368..f8c9e95 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__end_2.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__end_2.snap @@ -5,6 +5,7 @@ expression: "find(\"c d\", \"a b c d\")" Some( ( 4..7, + Exact, "c d", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_haystack.snap b/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_haystack.snap index 5c862cd..2c41958 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_haystack.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_haystack.snap @@ -5,6 +5,7 @@ expression: "find(\"this is a new-line\", \"this is a new-\\nline\")" Some( ( 0..19, + Exact, "this is a new-\nline", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_needle.snap b/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_needle.snap index e7827ca..9dd6f39 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_needle.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__hyphenated_needle.snap @@ -5,6 +5,7 @@ expression: "find(\"this is a new-\\nline\", \"this is a new-line\")" Some( ( 0..18, + Exact, "this is a new-line", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__middle.snap b/duvet/src/text/snapshots/duvet__text__find__tests__middle.snap index f5883d9..3dafcde 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__middle.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__middle.snap @@ -5,6 +5,7 @@ expression: "find(\"b\", \"a b c d\")" Some( ( 2..3, + Exact, "b", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__middle_2.snap b/duvet/src/text/snapshots/duvet__text__find__tests__middle_2.snap index aee8cc6..2585524 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__middle_2.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__middle_2.snap @@ -5,6 +5,7 @@ expression: "find(\"b c\", \"a b c d\")" Some( ( 2..5, + Exact, "b c", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__punctuation_test.snap b/duvet/src/text/snapshots/duvet__text__find__tests__punctuation_test.snap new file mode 100644 index 0000000..f744c6c --- /dev/null +++ b/duvet/src/text/snapshots/duvet__text__find__tests__punctuation_test.snap @@ -0,0 +1,11 @@ +--- +source: duvet/src/text/find.rs +expression: "find(\" Second Sentence. \",\n\" First sentence. Second Sentence. Third Sentence. \")" +--- +Some( + ( + 22..38, + Exact, + "Second Sentence.", + ), +) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__start.snap b/duvet/src/text/snapshots/duvet__text__find__tests__start.snap index 3454092..054c82a 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__start.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__start.snap @@ -5,6 +5,7 @@ expression: "find(\"a\", \"a b c d\")" Some( ( 0..1, + Exact, "a", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__start_2.snap b/duvet/src/text/snapshots/duvet__text__find__tests__start_2.snap index b8ac5a4..4ba1c42 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__start_2.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__start_2.snap @@ -5,6 +5,7 @@ expression: "find(\"a b\", \"a b c d\")" Some( ( 0..3, + Exact, "a b", ), ) diff --git a/duvet/src/text/snapshots/duvet__text__find__tests__ws_difference.snap b/duvet/src/text/snapshots/duvet__text__find__tests__ws_difference.snap index ef1bfe9..4f40d48 100644 --- a/duvet/src/text/snapshots/duvet__text__find__tests__ws_difference.snap +++ b/duvet/src/text/snapshots/duvet__text__find__tests__ws_difference.snap @@ -5,6 +5,7 @@ expression: "find(\" this should ignore whitespace differences\ Some( ( 9..85, + Exact, "this should ignore whitespace differences", ), ) diff --git a/integration/snapshots/aws-cryptographic-material-providers-library.snap b/integration/snapshots/aws-cryptographic-material-providers-library.snap index 44f05ba..795abf8 100644 --- a/integration/snapshots/aws-cryptographic-material-providers-library.snap +++ b/integration/snapshots/aws-cryptographic-material-providers-library.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:201bdde1f3cdf9fda8fda3a0591036a294ef3b97dcfeeb1f8bbbea8a392e7097 -size 1902704 +oid sha256:a7e30e82c9ee64ec1e1bdb2fc092f49fc2a023b9f4f3d74aecd39f3974e1b0b5 +size 1902947 diff --git a/integration/snapshots/s2n-quic.snap b/integration/snapshots/s2n-quic.snap index 482c6ff..0e06fd2 100644 --- a/integration/snapshots/s2n-quic.snap +++ b/integration/snapshots/s2n-quic.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21eeaa29e6aa471e9ce00c1f3e5b967fc146eac957ed8f6f6796b823c0c11700 -size 6286465 +oid sha256:23cc03c36f7969188d61788fd08a51e1584fe9b6795d61b88f1962480e9e778f +size 6286485 diff --git a/integration/snapshots/s2n-tls.snap b/integration/snapshots/s2n-tls.snap index 32a1b1a..37de98e 100644 --- a/integration/snapshots/s2n-tls.snap +++ b/integration/snapshots/s2n-tls.snap @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a84733f2483a22a58445970b4dcf483409b94a48efb5cd234fc62624c8e0fa8 -size 3272196 +oid sha256:ec82b38ea4c7035c2157f40e9f892aa6a972ad7c97e27f771dbabd5bb2af2363 +size 3272022