From 8de463f750833c01524049c465134800652d1f9f Mon Sep 17 00:00:00 2001 From: Elias Kosunen Date: Sun, 3 Dec 2023 01:39:04 +0200 Subject: [PATCH] Regex flags /msin --- .github/workflows/linux.yml | 2 +- include/scn/detail/format_string_parser.h | 83 +++++++++++-- src/scn/impl/reader/regex_reader.h | 137 ++++++++++++++++++++-- src/scn/impl/reader/string_reader.h | 14 ++- tests/unittests/regex_test.cpp | 39 ++++++ 5 files changed, 249 insertions(+), 26 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 91cf96b0..b16255f1 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -477,7 +477,7 @@ jobs: uses: lukka/get-cmake@latest - name: Install re2 - if: matrix.engine == "re2" + if: matrix.engine == 're2' run: | git clone https://github.com/google/re2 -b 2023-11-01 --depth=1 cd re2 diff --git a/include/scn/detail/format_string_parser.h b/include/scn/detail/format_string_parser.h index 7b9f2f65..1fdaa2f7 100644 --- a/include/scn/detail/format_string_parser.h +++ b/include/scn/detail/format_string_parser.h @@ -56,6 +56,48 @@ namespace scn { pointer, // 'p' }; + enum class regex_flags { + none = 0, + multiline = 1, // /m + singleline = 2, // /s + nocase = 4, // /i + nocapture = 8, // /n + // TODO? + // would probably need to go hand-in-hand with locale, + // where it could even be the default/only option -> no flag? + // why else would you even use locale with a regex? + // collate = 16, + }; + + constexpr regex_flags operator&(regex_flags a, regex_flags b) + { + return static_cast(static_cast(a) & + static_cast(b)); + } + constexpr regex_flags operator|(regex_flags a, regex_flags b) + { + return static_cast(static_cast(a) | + static_cast(b)); + } + constexpr regex_flags operator^(regex_flags a, regex_flags b) + { + return static_cast(static_cast(a) ^ + static_cast(b)); + } + + constexpr regex_flags& operator&=(regex_flags& a, regex_flags b) + { + return a = a & b; + } + constexpr regex_flags& operator|=(regex_flags& a, regex_flags b) + { + return a = a | b; + } + constexpr regex_flags& operator^=(regex_flags& a, regex_flags b) + { + return a = a ^ b; + } + template struct basic_format_specs { int width{0}; @@ -64,7 +106,7 @@ namespace scn { std::array charset_literals{0}; bool charset_has_nonascii{false}, charset_is_inverted{false}; std::basic_string_view charset_string{}; - std::basic_string_view regex_flags{}; + regex_flags regexp_flags{regex_flags::none}; unsigned arbitrary_base : 6; unsigned align : 2; bool localized : 1; @@ -205,9 +247,9 @@ namespace scn { { m_specs.charset_string = pattern; } - constexpr void on_regex_flags(std::basic_string_view flags) + constexpr void on_regex_flags(regex_flags flags) { - m_specs.regex_flags = flags; + m_specs.regexp_flags = flags; } constexpr void on_thsep() @@ -640,21 +682,48 @@ namespace scn { return begin; } + regex_flags flags{regex_flags::none}; + constexpr std::array, 4> flag_map{ + {{'m', regex_flags::multiline}, + {'s', regex_flags::singleline}, + {'i', regex_flags::nocase}, + {'n', regex_flags::nocapture}}}; for (; begin != end; ++begin) { if (*begin == CharT{'}'}) { break; } + bool found_flag = false; + for (auto flag : flag_map) { + if (static_cast(flag.first) != *begin) { + continue; + } + if ((flags & flag.second) != regex_flags::none) { + handler.on_error("Flag set multiple times in regex"); + return begin; + } +#if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD + if (*begin == CharT{'s'}) { + handler.on_error( + "/s flag for regex isn't supported by regex " + "backend"); + } +#endif + flags |= flag.second; + found_flag = true; + break; + } + if (!found_flag) { + handler.on_error("Invalid flag in regex"); + return begin; + } } + handler.on_regex_flags(flags); if (SCN_UNLIKELY(begin == end)) { handler.on_error("Unexpected end of regex in format string"); return begin; } - auto flags_end = begin; - handler.on_regex_flags( - make_string_view_from_pointers(regex_end + 1, flags_end)); - return begin; #else handler.on_error("Regular expression support is disabled"); diff --git a/src/scn/impl/reader/regex_reader.h b/src/scn/impl/reader/regex_reader.h index a9e09e80..1ba37081 100644 --- a/src/scn/impl/reader/regex_reader.h +++ b/src/scn/impl/reader/regex_reader.h @@ -41,16 +41,95 @@ namespace scn { SCN_BEGIN_NAMESPACE namespace impl { +#if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD + constexpr auto make_regex_flags(detail::regex_flags flags) + -> scan_expected + { + std::regex_constants::syntax_option_type result{}; + if ((flags & detail::regex_flags::multiline) != + detail::regex_flags::none) { + result |= std::regex_constants::multiline; + } + if ((flags & detail::regex_flags::singleline) != + detail::regex_flags::none) { + return unexpected_scan_error( + scan_error::invalid_format_string, + "/s flag for regex isn't supported by regex backend"); + } + if ((flags & detail::regex_flags::nocase) != + detail::regex_flags::none) { + result |= std::regex_constants::icase; + } + if ((flags & detail::regex_flags::nocapture) != + detail::regex_flags::none) { + result |= std::regex_constants::nosubs; + } + return result; + } +#elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_BOOST + constexpr auto make_regex_flags(detail::regex_flags flags) + -> boost::regex_constants::syntax_option_type + { + boost::regex_constants::syntax_option_type result{}; + if ((flags & detail::regex_flags::multiline) == + detail::regex_flags::none) { + result |= boost::regex_constants::no_mod_m; + } + if ((flags & detail::regex_flags::singleline) != + detail::regex_flags::none) { + result |= boost::regex_constants::mod_s; + } + if ((flags & detail::regex_flags::nocase) != + detail::regex_flags::none) { + result |= boost::regex_constants::icase; + } + if ((flags & detail::regex_flags::nocapture) != + detail::regex_flags::none) { + result |= boost::regex_constants::nosubs; + } + return result; + } +#elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_RE2 + auto make_regex_flags(detail::regex_flags flags) + -> std::pair + { + RE2::Options opt{RE2::Quiet}; + std::string_view stringflags{}; + + if ((flags & detail::regex_flags::multiline) == + detail::regex_flags::none) { + stringflags = "(?m)"; + } + if ((flags & detail::regex_flags::singleline) != + detail::regex_flags::none) { + opt.set_dot_nl(true); + } + if ((flags & detail::regex_flags::nocase) != + detail::regex_flags::none) { + opt.set_case_sensitive(false); + } + if ((flags & detail::regex_flags::nocapture) != + detail::regex_flags::none) { + opt.set_never_capture(true); + } + + return {opt, stringflags}; + } +#endif + template auto read_regex_string_impl(std::basic_string_view pattern, + detail::regex_flags flags, std::basic_string_view input) -> scan_expected::iterator> { #if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD std::basic_regex re{}; try { - re = std::basic_regex{pattern.data(), pattern.size(), - std::basic_regex::nosubs}; + SCN_TRY(re_flags, make_regex_flags(flags)); + re = std::basic_regex{ + pattern.data(), pattern.size(), + re_flags | std::regex_constants::nosubs}; } catch (const std::regex_error& err) { return unexpected_scan_error(scan_error::invalid_format_string, @@ -81,12 +160,15 @@ namespace scn { #if SCN_REGEX_BOOST_USE_ICU boost::make_u32regex(pattern.data(), pattern.data() + pattern.size(), - boost::regex_constants::no_except | + make_regex_flags(flags) | + boost::regex_constants::no_except | boost::regex_constants::nosubs); #else - boost::basic_regex{pattern.data(), pattern.size(), - boost::regex_constants::no_except | - boost::regex_constants::nosubs}; + boost::basic_regex{ + pattern.data(), pattern.size(), + make_regex_flags(flags) | + boost::regex_constants::no_except | + boost::regex_constants::nosubs}; #endif if (re.status() != 0) { return unexpected_scan_error(scan_error::invalid_format_string, @@ -121,7 +203,18 @@ namespace scn { ranges::distance(input.data(), matches[0].second); #elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_RE2 static_assert(std::is_same_v); - auto re = re2::RE2{pattern, RE2::Quiet}; + std::string flagged_pattern{}; + auto re = [&]() { + auto [opts, flagstr] = make_regex_flags(flags); + opts.set_never_capture(true); + if (flagstr.empty()) { + return re2::RE2{pattern, opts}; + } + flagged_pattern.reserve(flagstr.size() + pattern.size()); + flagged_pattern.append(flagstr); + flagged_pattern.append(pattern); + return re2::RE2{flagged_pattern, opts}; + }(); if (!re.ok()) { return unexpected_scan_error( scan_error::invalid_format_string, @@ -141,6 +234,7 @@ namespace scn { template auto read_regex_matches_impl(std::basic_string_view pattern, + detail::regex_flags flags, std::basic_string_view input, basic_regex_matches& value) -> scan_expected::iterator> @@ -148,7 +242,9 @@ namespace scn { #if SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_STD std::basic_regex re{}; try { - re = std::basic_regex{pattern.data(), pattern.size()}; + SCN_TRY(re_flags, make_regex_flags(flags)); + re = std::basic_regex{pattern.data(), pattern.size(), + re_flags}; } catch (const std::regex_error& err) { return unexpected_scan_error(scan_error::invalid_format_string, @@ -215,10 +311,13 @@ namespace scn { #if SCN_REGEX_BOOST_USE_ICU boost::make_u32regex(pattern.data(), pattern.data() + pattern.size(), - boost::regex_constants::no_except); + make_regex_flags(flags) | + boost::regex_constants::no_except); #else - boost::basic_regex{pattern.data(), pattern.size(), - boost::regex_constants::no_except}; + boost::basic_regex{ + pattern.data(), pattern.size(), + make_regex_flags(flags) | + boost::regex_constants::no_except}; #endif if (re.status() != 0) { return unexpected_scan_error(scan_error::invalid_format_string, @@ -272,13 +371,24 @@ namespace scn { ranges::distance(input.data(), matches[0].second); #elif SCN_REGEX_BACKEND == SCN_REGEX_BACKEND_RE2 static_assert(std::is_same_v); - auto re = re2::RE2{pattern, RE2::Quiet}; + std::string flagged_pattern{}; + auto re = [&]() { + auto [opts, flagstr] = make_regex_flags(flags); + if (flagstr.empty()) { + return re2::RE2{pattern, opts}; + } + flagged_pattern.reserve(flagstr.size() + pattern.size()); + flagged_pattern.append(flagstr); + flagged_pattern.append(pattern); + return re2::RE2{flagged_pattern, opts}; + }(); if (!re.ok()) { return unexpected_scan_error( scan_error::invalid_format_string, "Failed to parse regular expression"); } - size_t max_matches_n = + // TODO: Optimize into a single batch allocation + const auto max_matches_n = static_cast(re.NumberOfCapturingGroups()); std::vector> matches(max_matches_n); std::vector match_args(max_matches_n); @@ -368,6 +478,7 @@ namespace scn { ranges::data(range), ranges::data(range) + ranges::size(range)); SCN_TRY(it, read_regex_matches_impl(specs.charset_string, + specs.regexp_flags, input, value)); return ranges::begin(range) + ranges::distance(input.begin(), it); diff --git a/src/scn/impl/reader/string_reader.h b/src/scn/impl/reader/string_reader.h index ebfacb32..68d88f02 100644 --- a/src/scn/impl/reader/string_reader.h +++ b/src/scn/impl/reader/string_reader.h @@ -135,9 +135,10 @@ namespace scn { scan_expected> read( Range&& range, std::basic_string_view pattern, + detail::regex_flags flags, std::basic_string& value) { - SCN_TRY(it, impl(range, pattern)); + SCN_TRY(it, impl(range, pattern, flags)); return read_string_impl(range, it, value); } @@ -145,16 +146,18 @@ namespace scn { scan_expected> read( Range&& range, std::basic_string_view pattern, + detail::regex_flags flags, std::basic_string_view& value) { - SCN_TRY(it, impl(range, pattern)); + SCN_TRY(it, impl(range, pattern, flags)); return read_string_view_impl(range, it, value); } private: template auto impl(Range&& range, - std::basic_string_view pattern) + std::basic_string_view pattern, + detail::regex_flags flags) -> scan_expected> { if constexpr (!ranges::contiguous_range) { @@ -172,7 +175,7 @@ namespace scn { auto input = detail::make_string_view_from_pointers( ranges::data(range), ranges::data(range) + ranges::size(range)); - SCN_TRY(it, read_regex_string_impl(pattern, input)); + SCN_TRY(it, read_regex_string_impl(pattern, flags, input)); return ranges::begin(range) + ranges::distance(input.begin(), it); } @@ -552,7 +555,8 @@ namespace scn { #if !SCN_DISABLE_REGEX case reader_type::regex: return regex_string_reader_impl{}.read( - SCN_FWD(range), specs.charset_string, value); + SCN_FWD(range), specs.charset_string, + specs.regexp_flags, value); #endif default: diff --git a/tests/unittests/regex_test.cpp b/tests/unittests/regex_test.cpp index 9c17e114..ac18e7a5 100644 --- a/tests/unittests/regex_test.cpp +++ b/tests/unittests/regex_test.cpp @@ -227,3 +227,42 @@ TEST(RegexTest, EmojiWithSoUnicodeCharacterClass) #endif } #endif + +TEST(RegexTest, NoCaseFlagStringView) +{ + auto r = scn::scan("FooBar123", "{:/[a-z]+/i}"); + ASSERT_TRUE(r); + EXPECT_FALSE(r->range().empty()); + EXPECT_EQ(r->value(), "FooBar"); +} + +TEST(RegexTest, NoCaseFlagMatches) +{ + auto r = scn::scan("FooBar123", "{:/([a-z]+)/i}"); + ASSERT_TRUE(r); + EXPECT_FALSE(r->range().empty()); + EXPECT_THAT(r->value(), + testing::ElementsAre(testing::Optional(testing::Property( + &scn::regex_match::get, "FooBar"sv)), + testing::Optional(testing::Property( + &scn::regex_match::get, "FooBar"sv)))); +} + +TEST(RegexTest, NoCaseAndNoCaptureFlagStringView) +{ + auto r = scn::scan("FooBar123", "{:/[a-z]+/in}"); + ASSERT_TRUE(r); + EXPECT_FALSE(r->range().empty()); + EXPECT_EQ(r->value(), "FooBar"); +} + +TEST(RegexTest, NoCaseAndNoCaptureFlagMatches) +{ + auto r = + scn::scan("FooBar123", "{:/([a-z]+)([0-9]+)/in}"); + ASSERT_TRUE(r); + EXPECT_TRUE(r->range().empty()); + EXPECT_THAT(r->value(), + testing::ElementsAre(testing::Optional( + testing::Property(&scn::regex_match::get, "FooBar123"sv)))); +}