diff --git a/hae.cc b/hae.cc index 909562d..7efc8e2 100644 --- a/hae.cc +++ b/hae.cc @@ -94,7 +94,7 @@ std::vector combine_chunks(std::vector &chunks, int64_ std::string buffer = ""; for (size_t i = 0; i < chunks.size(); ++i) { - buffer += chunks[i] + "\n"; + buffer += chunks[i] + "\n\n"; // If the chunk has multiple lines, just append it if (buffer.length() > min_size) { combined.push_back(buffer.substr(0, buffer.length() - 1)); @@ -111,7 +111,9 @@ std::vector combine_chunks(std::vector &chunks, int64_ std::vector split_sentences(const std::string& text) { std::string wiki_citation_re = "(\\^\\[[0-9]+\\])*"; - std::regex full_re(":\\n" + wiki_citation_re + "|[.!?]" + wiki_citation_re + "\\s"); + std::string double_newline_re = "\r?\n\r?\n"; + + std::regex full_re(":\\n" + wiki_citation_re + "|[.!?]" + wiki_citation_re + "\\s" + "|" + double_newline_re); size_t prev = 0; std::vector sentences; diff --git a/test/t3.txt b/test/t3.txt index 1fc7aff..3ead672 100644 --- a/test/t3.txt +++ b/test/t3.txt @@ -1,4 +1,5 @@  This is a test +  diff --git a/test/t4.txt b/test/t4.txt index 5a4e112..346b023 100644 --- a/test/t4.txt +++ b/test/t4.txt @@ -1,3 +1,4 @@ This is a test +  diff --git a/test/t5.txt b/test/t5.txt index 6fb3115..f170319 100644 --- a/test/t5.txt +++ b/test/t5.txt @@ -3,3 +3,4 @@ 1.  +