Fixed UNICODE processing with the strip_non_letters flag in src/pre…

…processing.jl (#265) * Changed the regex for `strip_non_letters` in src/preprocessing.jl to `[^\p{L}\s]`, because `[^a-zA-Z\s]` also matches non-ASCII letters and therefore removes them (for example, characters with diacritics). * Added use cases with Unicode for the Corpus preprocessing with the `strip_non_letters` flag. --------- Co-authored-by: rssdev10 <[email protected]>
JuliaText · Oct 26, 2023 · 6d00310 · 6d00310
1 parent d57d8c4
commit 6d00310
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 6 deletions.
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
@@ -557,7 +557,7 @@ end
 function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where T <: AbstractString
     #((flags & strip_whitespace) > 0) && push!(patterns, "\\s+")
     if (flags & strip_non_letters) > 0
-        push!(patterns, "[^a-zA-Z\\s]")
+	push!(patterns, "[^\\p{L}\\s]")
     else
         ((flags & strip_punctuation) > 0) && push!(patterns, "[-.,:;,!?'\"\\[\\]\\(\\)\\{\\}|\\`#\$%@^&*_+<>“”—’‘/]+")
         ((flags & strip_numbers) > 0) && push!(patterns, "\\d+")

diff --git a/test/preprocessing.jl b/test/preprocessing.jl
@@ -1,11 +1,11 @@
 
 @testset "Preprocessing" begin
 
-    sample_text1 = "This is 1 MESSED υπ string!"
-    sample_text1_wo_punctuation = "This is 1 MESSED υπ string"
-    sample_text1_wo_punctuation_numbers = "This is  MESSED υπ string"
-    sample_text1_wo_punctuation_numbers_case = "this is  messed υπ string"
-    sample_text1_wo_punctuation_numbers_case_az = "this is  messed  string"
+    sample_text1 = "This is 1 MESSED 1 string!"
+    sample_text1_wo_punctuation = "This is 1 MESSED   string"
+    sample_text1_wo_punctuation_numbers = "This is  MESSED  string"
+    sample_text1_wo_punctuation_numbers_case = "this is  messed string"
+    sample_text1_wo_punctuation_numbers_case_az = "this is  messed 1 string"
 
     sample_texts = [
         sample_text1,
@@ -142,3 +142,18 @@
     prepare!(crps, strip_html_tags | strip_whitespace | strip_non_letters)
     @test isequal(crps[1].text, "Hi there")
 end
+
+@testset "strip_non_letters with Unicode" begin
+    samples = [
+        ("   Wörterbuch  für  Ärzte!     ", ["Wörterbuch", "für", "Ärzte"])
+        ("   Проверим     прочие алфавиты: αλφάβητο 字母 !  ", ["Проверим", "прочие", "алфавиты", "αλφάβητο", "字母"])
+        ("123 الأبجدية  456", ["الأبجدية"])
+        # Disabled Devanagari sample: presumably its combining marks are \p{M}, not \p{L}, so they get stripped (TODO confirm); if re-enabled, expected should be a vector like the other entries because the loop joins it with ' ': ("  वर्णमाला  ! ", "वर्णमाला")
+    ]
+
+    for (sample, expected) in samples
+        crps = Corpus([StringDocument(sample)])
+        prepare!(crps, strip_html_tags | strip_whitespace | strip_non_letters)
+        @test isequal(crps[1].text, join(expected, ' '))
+    end
+end