Skip to content

Commit

Permalink
Fixed UNICODE processing with the strip_non_letters flag in src/pre…
Browse files Browse the repository at this point in the history
…processing.jl (#265)

* Changed the regex for strip_non_letters in src/preprocessing.jl to [^\p{L}\s], because [^a-zA-Z\s] also matches non-ASCII letters and therefore removes them (for example, letters with diacritics).

* Added Unicode test cases for Corpus preprocessing with the `strip_non_letters` flag.

---------

Co-authored-by: rssdev10 <[email protected]>
  • Loading branch information
sigmundv and rssdev10 authored Oct 26, 2023
1 parent d57d8c4 commit 6d00310
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ end
function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where T <: AbstractString
#((flags & strip_whitespace) > 0) && push!(patterns, "\\s+")
if (flags & strip_non_letters) > 0
push!(patterns, "[^a-zA-Z\\s]")
push!(patterns, "[^\\p{L}\\s]")
else
((flags & strip_punctuation) > 0) && push!(patterns, "[-.,:;,!?'\"\\[\\]\\(\\)\\{\\}|\\`#\$%@^&*_+<>“”—’‘/]+")
((flags & strip_numbers) > 0) && push!(patterns, "\\d+")
Expand Down
25 changes: 20 additions & 5 deletions test/preprocessing.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

@testset "Preprocessing" begin

sample_text1 = "This is 1 MESSED υπ string!"
sample_text1_wo_punctuation = "This is 1 MESSED υπ string"
sample_text1_wo_punctuation_numbers = "This is MESSED υπ string"
sample_text1_wo_punctuation_numbers_case = "this is messed υπ string"
sample_text1_wo_punctuation_numbers_case_az = "this is messed string"
sample_text1 = "This is 1 MESSED 1 string!"
sample_text1_wo_punctuation = "This is 1 MESSED string"
sample_text1_wo_punctuation_numbers = "This is MESSED string"
sample_text1_wo_punctuation_numbers_case = "this is messed string"
sample_text1_wo_punctuation_numbers_case_az = "this is messed 1 string"

sample_texts = [
sample_text1,
Expand Down Expand Up @@ -142,3 +142,18 @@
prepare!(crps, strip_html_tags | strip_whitespace | strip_non_letters)
@test isequal(crps[1].text, "Hi there")
end

@testset "strip_non_letters with Unicode" begin
    # Each case pairs a raw input string with the words that are expected to
    # remain after preparation; the expected text is those words joined by
    # single spaces.
    cases = [
        (" Wörterbuch für Ärzte! ", ["Wörterbuch", "für", "Ärzte"]),
        (" Проверим прочие алфавиты: αλφάβητο 字母 ! ", ["Проверим", "прочие", "алфавиты", "αλφάβητο", "字母"]),
        ("123 الأبجدية 456", ["الأبجدية"]),
        # Disabled sample, kept for reference:
        # (" वर्णमाला ! ", "वर्णमाला")
        # NOTE(review): presumably disabled because this script relies on
        # combining marks that `\p{L}` does not cover — confirm before enabling.
    ]

    # The same preparation mask is applied to every case.
    flags = strip_html_tags | strip_whitespace | strip_non_letters

    for (raw, words) in cases
        corpus = Corpus([StringDocument(raw)])
        prepare!(corpus, flags)
        wanted = join(words, ' ')
        @test isequal(corpus[1].text, wanted)
    end
end

0 comments on commit 6d00310

Please sign in to comment.