diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index e71051ad..4fb71a0b 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -557,7 +557,10 @@ end
 function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where T <: AbstractString
     #((flags & strip_whitespace) > 0) && push!(patterns, "\\s+")
     if (flags & strip_non_letters) > 0
-        push!(patterns, "[^a-zA-Z\\s]")
+        # Drop every character that is not a letter (\p{L}), a combining mark
+        # (\p{M}) or whitespace. Marks must be kept: vowel signs and decomposed
+        # accents are category M, and deleting them corrupts the word.
+        push!(patterns, "[^\\p{L}\\p{M}\\s]")
     else
         ((flags & strip_punctuation) > 0) && push!(patterns, "[-.,:;,!?'\"\\[\\]\\(\\)\\{\\}|\\`#\$%@^&*_+<>“”—’‘/]+")
         ((flags & strip_numbers) > 0) && push!(patterns, "\\d+")
diff --git a/test/preprocessing.jl b/test/preprocessing.jl
index 96a042df..f986b49b 100644
--- a/test/preprocessing.jl
+++ b/test/preprocessing.jl
@@ -1,11 +1,11 @@
 
 @testset "Preprocessing" begin
 
-    sample_text1 = "This is 1 MESSED υπ string!"
-    sample_text1_wo_punctuation = "This is 1 MESSED υπ string"
-    sample_text1_wo_punctuation_numbers = "This is MESSED υπ string"
-    sample_text1_wo_punctuation_numbers_case = "this is messed υπ string"
-    sample_text1_wo_punctuation_numbers_case_az = "this is messed string"
+    sample_text1 = "This is 1 MESSED 1 string!"
+    sample_text1_wo_punctuation = "This is 1 MESSED string"
+    sample_text1_wo_punctuation_numbers = "This is MESSED string"
+    sample_text1_wo_punctuation_numbers_case = "this is messed string"
+    sample_text1_wo_punctuation_numbers_case_az = "this is messed 1 string"
 
     sample_texts = [
         sample_text1,
@@ -142,3 +142,21 @@
     prepare!(crps, strip_html_tags | strip_whitespace | strip_non_letters)
     @test isequal(crps[1].text, "Hi there")
 end
+
+@testset "strip_non_letters with Unicode" begin
+    # Each entry pairs a raw input with the words expected to survive
+    # `strip_non_letters`, which must keep letters of any script.
+    samples = [
+        (" Wörterbuch für Ärzte! ", ["Wörterbuch", "für", "Ärzte"])
+        (" Проверим прочие алфавиты: αλφάβητο 字母 ! ", ["Проверим", "прочие", "алфавиты", "αλφάβητο", "字母"])
+        ("123 الأبجدية 456", ["الأبجدية"])
+        # Devanagari vowel signs are combining marks (category M), not letters.
+        (" वर्णमाला ! ", ["वर्णमाला"])
+    ]
+
+    for (sample, expected) in samples
+        crps = Corpus([StringDocument(sample)])
+        prepare!(crps, strip_html_tags | strip_whitespace | strip_non_letters)
+        @test isequal(crps[1].text, join(expected, ' '))
+    end
+end