Add check for unicode normalization in config (#44)
This ensures that the config sections match Wikipedia's Unicode
normalization. We could also normalize every section in every article to
handle an edge case where non-normalized output is inadvertently created
as tags are joined, but I don't think that's worth it yet.

From <https://mediawiki.org/wiki/Unicode_normalization_considerations>:

> MediaWiki applies normalization form C (NFC) to Unicode text input.

> MediaWiki doesn't apply any normalization to its output, for example
> `cafe<nowiki/>́` becomes "café" (shows U+0065 U+0301 in a row,
> without precomposed characters like U+00E9 appearing).
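
A minimal sketch of that difference using the `unicode-normalization` crate (the same crate this commit adds as a dev-dependency); `is_nfc` and `.nfc()` are the same calls the new test uses:

```rust
use unicode_normalization::{is_nfc, UnicodeNormalization};

fn main() {
    // MediaWiki's example output: "e" followed by a combining acute
    // accent (U+0065 U+0301), which NFC would precompose.
    let decomposed = "cafe\u{301}";
    assert!(!is_nfc(decomposed));

    // Applying NFC recomposes it into "café" with U+00E9.
    let normalized: String = decomposed.nfc().collect();
    assert_eq!(normalized, "caf\u{e9}");
    assert!(is_nfc(&normalized));
}
```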

Signed-off-by: Evan Lloyd New-Schmidt <[email protected]>
newsch authored Apr 24, 2024
1 parent 19d9f2c commit cd03fed
Showing 3 changed files with 57 additions and 8 deletions.
Cargo.lock (5 changes: 3 additions & 2 deletions)

Generated file; diff not rendered.

Cargo.toml (5 changes: 4 additions & 1 deletion)
Expand Up @@ -13,7 +13,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
csv = "1.2.2"
ego-tree = "0.6.2"
expect-test = "1.4.1"
html5ever = "0.26.0"
log = "0.4.18"
markup5ever = "0.11.0"
Expand All @@ -29,6 +28,10 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
url = "2.3.1"
urlencoding = "2.1.2"

[dev-dependencies]
expect-test = "1.4.1"
unicode-normalization = "0.1.23"

[profile.release]
overflow-checks = true
lto = true
Expand Down
src/html.rs (55 changes: 50 additions & 5 deletions)
```diff
@@ -23,12 +23,23 @@ struct Config<'a> {
     sections_to_remove: BTreeMap<&'a str, BTreeSet<&'a str>>,
 }
 
+/// Path to the processing config file.
+///
+/// Other compile-time macros expect a string literal, so this must be a macro instead of a const str.
+macro_rules! config_path {
+    () => {
+        concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/article_processing_config.json"
+        )
+    };
+}
+
 static CONFIG: Lazy<Config<'static>> = Lazy::new(|| {
-    serde_json::from_str(include_str!(concat!(
-        env!("CARGO_MANIFEST_DIR"),
-        "/article_processing_config.json"
-    )))
-    .expect("\"article_processing_config.json\" is either invalid json or the wrong structure")
+    serde_json::from_str(include_str!(config_path!())).expect(concat!(
+        config_path!(),
+        " is either invalid json or the wrong structure"
+    ))
 });
 
 static HEADERS: Lazy<Selector> =
```
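
The doc comment on `config_path!` is the key constraint here: built-in macros like `include_str!` only accept a string literal (or another macro that expands to one), not a `const`. A minimal standalone sketch, assuming a `config.json` next to `Cargo.toml` (file name hypothetical):

```rust
// Works: the macro expands to a string literal before
// `include_str!` evaluates its argument.
macro_rules! config_path {
    () => {
        concat!(env!("CARGO_MANIFEST_DIR"), "/config.json")
    };
}

static TEXT: &str = include_str!(config_path!());

// Does not compile: `include_str!` rejects a const, even one
// built from the same `concat!`/`env!` expression.
// const PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/config.json");
// static TEXT2: &str = include_str!(PATH);

fn main() {
    println!("embedded {} bytes at compile time", TEXT.len());
}
```
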
```diff
@@ -203,6 +214,9 @@ fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
 }
 
 /// Remove sections with the specified `titles` and all trailing elements until next section.
+///
+/// `titles` are matched by case-sensitive simple byte comparison.
+/// `titles` should be normalized to Unicode NFC to match Wikipedia's internal normalization: <https://mediawiki.org/wiki/Unicode_normalization_considerations>.
 fn remove_sections(document: &mut Html, titles: &BTreeSet<&str>) {
     let mut to_remove = Vec::new();
 
```
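
To see why the new doc comment insists on NFC before the byte comparison, a quick sketch (section title chosen for illustration):

```rust
fn main() {
    // The same visible title in two Unicode encodings:
    let precomposed = "Producci\u{f3}n"; // NFC: "ó" is U+00F3
    let decomposed = "Produccio\u{301}n"; // decomposed: "o" + U+0301

    // They render identically but differ bytewise, so a decomposed
    // entry in the config would never match an NFC article heading.
    assert_ne!(precomposed, decomposed);
    assert_eq!(precomposed.len(), 11); // UTF-8 bytes
    assert_eq!(decomposed.len(), 12);
}
```
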
```diff
@@ -488,6 +502,37 @@ mod test {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
 
+    /// Ensure config sections match Wikipedia's Unicode normalization (NFC) so
+    /// that they can be correctly compared bytewise.
+    ///
+    /// As the discussion below mentions, there is an edge-case where section
+    /// names in the article contain templates that expand to non-normalized
+    /// text, which this does not handle.
+    ///
+    /// See also:
+    /// - [super::remove_sections]
+    /// - Mediawiki discussion of normalization: https://mediawiki.org/wiki/Unicode_normalization_considerations
+    /// - Online conversion tool: https://util.unicode.org/UnicodeJsps/transform.jsp?a=Any-NFC
+    #[test]
+    fn static_config_sections_are_normalized() {
+        use unicode_normalization::{is_nfc, UnicodeNormalization};
+
+        let mut all_sections_are_normalized = true;
+        for section in CONFIG.sections_to_remove.values().flatten() {
+            if !is_nfc(section) {
+                all_sections_are_normalized = false;
+                let normalized = String::from_iter(section.nfc());
+                eprintln!("Section to remove {section:?} should be normalized to {normalized:?}");
+            }
+        }
+
+        assert!(
+            all_sections_are_normalized,
+            "Not all sections in {} are in Unicode NFC. Please replace the reported sections.",
+            config_path!()
+        );
+    }
+
     fn expand_links(document: &mut Html) {
         let links: Vec<_> = document
             .select(&Selector::parse("a").unwrap())
```
