Add an opt-in `collapse_newlines` keyword to rule-based sentence splitting.

When the keyword is true, a newline that sits directly between two ASCII letters
or digits is replaced by a space, so hard-wrapped text is not split mid-sentence.
`postproc_splits` can still be called with a single argument, and the generated
`split_sentences` forwards keywords instead of always passing `collapse_newlines`,
so splitters that take no keyword arguments keep working.

diff --git a/src/sentences/sentence_splitting.jl b/src/sentences/sentence_splitting.jl
index 0f58147..c6489a3 100644
--- a/src/sentences/sentence_splitting.jl
+++ b/src/sentences/sentence_splitting.jl
@@ -1,7 +1,19 @@
 
-function rulebased_split_sentences(sentences)
+"""
+    rulebased_split_sentences(sentences; collapse_newlines::Bool=false)
+
+Split `sentences` into sentences and return the pieces.
+
+The whitespace character after each `?`, `!` or `.` is replaced by a newline, the text
+is post-processed by `postproc_splits`, and the result is split on newlines.
+
+# Keywords
+- `collapse_newlines::Bool=false`: forwarded to `postproc_splits`. When `true`, a newline
+  directly between two ASCII letters or digits is replaced by a space.
+"""
+function rulebased_split_sentences(sentences; collapse_newlines::Bool=false)
     sentences = replace(sentences, r"([?!.])\s" => Base.SubstitutionString("\\1\n"))
-    sentences = postproc_splits(sentences)
+    sentences = postproc_splits(sentences, collapse_newlines)
     split(sentences, "\n")
 end
 
@@ -34,7 +46,7 @@ Which draws in part on heuristics included in Yoshimasa Tsuruoka's
 medss.pl script.
 
 """
-function postproc_splits(sentences::AbstractString)
+function postproc_splits(sentences::AbstractString, collapse_newlines::Bool=false)
     # Before we do anything remove windows line-ends
     sentences = replace(sentences, "\r" => "")
 
@@ -120,7 +132,13 @@ function postproc_splits(sentences::AbstractString)
 
     sentences = replace(sentences, r"(\bMs\.)\n" => s"\1 ")
     sentences = replace(sentences, r"(\bMrs\.)\n" => s"\1 ")
-
+    # Optionally undo hard line wrapping: a newline directly between two ASCII
+    # letters/digits (no punctuation on either side) is not a sentence break.
+    # Lookarounds leave the neighbouring characters unconsumed, so consecutive
+    # one-character lines ("a\nb\nc") are all joined.
+    if collapse_newlines
+        sentences = replace(sentences, r"(?<=[a-zA-Z0-9])\n(?=[a-zA-Z0-9])" => " ")
+    end
     # possible TODO: filter excessively long / short sentences
 
 
diff --git a/src/set_method_api.jl b/src/set_method_api.jl
index d0d7b4f..8b9b26c 100644
--- a/src/set_method_api.jl
+++ b/src/set_method_api.jl
@@ -22,7 +22,9 @@ Calling this will trigger recompilation of any functions that use `split_sentenc
 Calling `set_sentence_splitter` will give method overwritten warnings.
 They are expected, be worried if they do not occur
 """
 function set_sentence_splitter(fun)
-    @eval split_sentences(str::AbstractString) = $(fun)(str)
+    # Forward whatever keywords the caller gives (e.g. `collapse_newlines`), so a
+    # splitter that takes no keyword arguments still works when none are passed.
+    @eval split_sentences(str::AbstractString; kwargs...) = $(fun)(str; kwargs...)
 end
 
diff --git a/test/sentence_splitting.jl b/test/sentence_splitting.jl
index d3b860d..3063a41 100644
--- a/test/sentence_splitting.jl
+++ b/test/sentence_splitting.jl
@@ -87,3 +87,19 @@ end
     And sometimes sentences can start with non-capitalized words.
     i is a good variable name.""")
 end
+
+@testset "collapse_newlines" begin
+    @test length(rulebased_split_sentences("""
+    In this article, we present a language-independent, unsupervised approach to sentence boundary
+    detection. It is based on the assumption that a large number of ambiguities in the determination
+    of sentence boundaries can be eliminated once abbreviations have been identified. Instead of
+    relying on orthographic clues, the proposed system is able to detect abbreviations with high
+    accuracy using three criteria that only require information about the candidate type itself and
+    are independent of context: Abbreviations can be defined as a very tight collocation consisting
+    of a truncated word and a final period, abbreviations are usually short, and abbreviations
+    sometimes contain internal periods.""", collapse_newlines=true)) == 3
+
+    # Consecutive one-character lines must all be joined, not every other one.
+    @test rulebased_split_sentences("a\nb\nc", collapse_newlines=true) == ["a b c"]
+end
+