From 08901490ca813bf82eb8696b3dd0da79a9bb5bde Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Fri, 12 Jul 2024 18:18:48 +0300 Subject: [PATCH] GH-40968: [C++][Gandiva] add RE2::Options set_dot_nl(true) for Like f (#80) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * apacheGH-40968: [C++][Gandiva] add RE2::Options set_dot_nl(true) for Like function (apache#40970) (dremio#68) Gandiva function "LIKE" does not always work correctly when the string contains \n. String value: `[function_name: "Space1.protect"\nargs: "passenger_count"\ncolumn_name: "passenger_count" ]` Pattern '%Space1%' nor '%Space1.%' do not match. added flag set_dot_nl(true) to LikeHolder add unit tests. Yes **This PR includes breaking changes to public APIs.** * GitHub Issue: apache#40968 Lead-authored-by: Ivan Chesnov Signed-off-by: Sutou Kouhei * GH-43119: [CI][Packaging] Update manylinux 2014 CentOS repos that have been deprecated (#43121) Jobs are failing to find mirrorlist.centos.org Updating repos based on solution from: https://github.com/apache/arrow/issues/43119#issuecomment-2203534492 Via archery No * GitHub Issue: #43119 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --------- Signed-off-by: Sutou Kouhei Signed-off-by: Raúl Cumplido Co-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei --- ci/docker/centos-7-cpp.dockerfile | 14 ++++++++ .../python-wheel-manylinux-201x.dockerfile | 15 ++++++++ cpp/src/gandiva/regex_functions_holder.cc | 13 ++++--- cpp/src/gandiva/regex_functions_holder.h | 2 +- .../gandiva/regex_functions_holder_test.cc | 34 ++++++++++++++----- 5 files changed, 64 insertions(+), 14 deletions(-) diff --git a/ci/docker/centos-7-cpp.dockerfile b/ci/docker/centos-7-cpp.dockerfile index 8c1893cbbb2ae..1f30eed694e4e 100644 --- a/ci/docker/centos-7-cpp.dockerfile +++ b/ci/docker/centos-7-cpp.dockerfile @@ -17,11 +17,25 @@ FROM centos:centos7 +# Update mirrors to use vault.centos.org as CentOS 7 +# is EOL since 2024-06-30 +RUN sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/*.repo + # devtoolset is required for C++17 RUN \ yum install -y \ centos-release-scl \ epel-release && \ + sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/^# baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/CentOS-SCLo-scl*.repo && \ yum install -y \ cmake3 \ curl \ diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile b/ci/docker/python-wheel-manylinux-201x.dockerfile index adab10da623b0..935a213f80bee 100644 --- a/ci/docker/python-wheel-manylinux-201x.dockerfile +++ b/ci/docker/python-wheel-manylinux-201x.dockerfile @@ -24,6 +24,21 @@ ARG manylinux ENV MANYLINUX_VERSION=${manylinux} +# Ensure dnf is installed, especially for the manylinux2014 base +RUN if [ "${MANYLINUX_VERSION}" = "2014" ]; then \ + sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/*.repo; \ + if [ "${arch}" != "amd64" ]; then \ + sed -i \ + -e 's,vault\.centos\.org/centos,vault.centos.org/altarch,' \ + /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \ + fi; \ + fi +RUN yum install -y dnf + # Install basic dependencies RUN yum install -y git flex curl autoconf zip perl-IPC-Cmd wget diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 30d68cbc87ab3..6cac3fc701be5 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -99,13 +99,14 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h "'like' function requires a string literal as the second parameter")); RE2::Options regex_op; + regex_op.set_dot_nl(true); if (node.descriptor()->name() == "ilike") { regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. return Make(std::get(literal->holder()), holder, regex_op); } if (node.children().size() == 2) { - return Make(std::get(literal->holder()), holder); + return Make(std::get(literal->holder()), holder, regex_op); } else { auto escape_char = dynamic_cast(node.children().at(2).get()); ARROW_RETURN_IF( @@ -118,7 +119,7 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h Status::Invalid( "'like' function requires a string literal as the third parameter")); return Make(std::get(literal->holder()), - std::get(escape_char->holder()), holder); + std::get(escape_char->holder()), holder, regex_op); } } @@ -127,7 +128,9 @@ Status LikeHolder::Make(const std::string& sql_pattern, std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + RE2::Options regex_op; + regex_op.set_dot_nl(true); + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); @@ -137,7 +140,7 @@ Status LikeHolder::Make(const std::string& sql_pattern, } Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, - std::shared_ptr* holder) { + std::shared_ptr* holder, RE2::Options regex_op) { ARROW_RETURN_IF(escape_char.length() > 1, Status::Invalid("The length of escape char ", escape_char, " in 'like' function is greater than 1")); @@ -149,7 +152,7 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); } - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h index ecf4095f3d473..08ffa4a8234f3 100644 --- a/cpp/src/gandiva/regex_functions_holder.h +++ b/cpp/src/gandiva/regex_functions_holder.h @@ -40,7 +40,7 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Status Make(const std::string& sql_pattern, std::shared_ptr* holder); static Status Make(const std::string& sql_pattern, const std::string& escape_char, - std::shared_ptr* holder); + std::shared_ptr* holder, RE2::Options regex_op); static Status Make(const std::string& sql_pattern, std::shared_ptr* holder, RE2::Options regex_op); diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 930f3a7ade718..7efc75e8d508c 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -27,6 +27,8 @@ namespace gandiva { class TestLikeHolder : public ::testing::Test { public: RE2::Options regex_op; + void SetUp() { regex_op.set_dot_nl(true); } + FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = @@ -75,6 +77,15 @@ TEST_F(TestLikeHolder, TestMatchOne) { EXPECT_FALSE(like("dabc")); } +TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) { + std::shared_ptr like_holder; + auto status = LikeHolder::Make("%Space1.%", &like_holder, regex_op); + + auto& like = *like_holder; + EXPECT_TRUE( + like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]")); +} + TEST_F(TestLikeHolder, TestPcreSpecial) { std::shared_ptr like_holder; @@ -104,17 +115,25 @@ TEST_F(TestLikeHolder, TestDot) { EXPECT_FALSE(like("abcd")); } +TEST_F(TestLikeHolder, TestMatchWithNewLine) { + std::shared_ptr like_holder; + auto status = LikeHolder::Make("%abc%", "\\", &like_holder, regex_op); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc\nd")); +} + TEST_F(TestLikeHolder, TestMatchSubString) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("%abc%", "\\", &like_holder); + auto status = LikeHolder::Make("%abc%", "\\", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &like_holder); + status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like_reserved_char = *like_holder; @@ -192,7 +211,7 @@ TEST_F(TestLikeHolder, TestOptimise) { TEST_F(TestLikeHolder, TestMatchOneEscape) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", "\\", &like_holder); + auto status = LikeHolder::Make("ab\\_", "\\", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -209,8 +228,7 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { TEST_F(TestLikeHolder, TestMatchManyEscape) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\%", "\\", &like_holder); - EXPECT_EQ(status.ok(), true) << status.message(); + auto status = LikeHolder::Make("ab\\%", "\\", &like_holder, regex_op); auto& like = *like_holder; @@ -226,7 +244,7 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { TEST_F(TestLikeHolder, TestMatchEscape) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder); + auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -239,7 +257,7 @@ TEST_F(TestLikeHolder, TestMatchEscape) { TEST_F(TestLikeHolder, TestEmptyEscapeChar) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", "", &like_holder); + auto status = LikeHolder::Make("ab\\_", "", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -254,7 +272,7 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { TEST_F(TestLikeHolder, TestMultipleEscapeChar) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder); + auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder, regex_op); EXPECT_EQ(status.ok(), false) << status.message(); }