Skip to content

Commit

Permalink
chore: GlobMatcher now uses the reflex::Matcher regex engine
Browse files Browse the repository at this point in the history
Also consolidate low-level benchmarking routines under dfly_core_test

Before this change:
```
BM_MatchGlob/1000               2164 ns         2164 ns       322600
BM_MatchGlob/10000             21985 ns        21981 ns        31815
```

After this change:

```
BM_MatchGlob/1000                137 ns          137 ns      5276350
BM_MatchGlob/10000               437 ns          437 ns      1510854
```

What's curious is that matching `*foobar*` against a string is now faster than
searching for `foobar` using string::find().

Signed-off-by: Roman Gershman <[email protected]>
  • Loading branch information
romange committed Feb 3, 2025
1 parent 80e4012 commit 6951bf5
Show file tree
Hide file tree
Showing 9 changed files with 342 additions and 222 deletions.
73 changes: 0 additions & 73 deletions .github/workflows/scorecard.yml

This file was deleted.

3 changes: 1 addition & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ add_third_party(

add_third_party(
reflex
URL https://github.com/Genivia/RE-flex/archive/refs/tags/v5.1.0.tar.gz
URL https://github.com/Genivia/RE-flex/archive/refs/tags/v5.2.1.tar.gz
PATCH_COMMAND autoreconf -fi
CONFIGURE_COMMAND <SOURCE_DIR>/configure --disable-avx2 --prefix=${THIRD_PARTY_LIB_DIR}/reflex
CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER}
Expand Down Expand Up @@ -125,7 +125,6 @@ add_third_party(
-DFLATBUFFERS_BUILD_FLATC=OFF"
)


add_library(TRDP::jsoncons INTERFACE IMPORTED)
add_dependencies(TRDP::jsoncons jsoncons_project)
set_target_properties(TRDP::jsoncons PROPERTIES
Expand Down
28 changes: 27 additions & 1 deletion src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,25 @@ cxx_link(dfly_core base absl::flat_hash_map absl::str_format redis_lib TRDP::lua
add_executable(dash_bench dash_bench.cc)
cxx_link(dash_bench dfly_core redis_test_lib)

cxx_test(dfly_core_test dfly_core TRDP::fast_float LABELS DFLY)
# Optional dependency: PCRE2_LIB holds the pcre2-8 library path when it is
# installed and stays empty otherwise, so it can be passed to a link list as-is.
find_library(LIB_PCRE2 NAMES pcre2-8)
set(PCRE2_LIB "")
if(LIB_PCRE2)
  set(PCRE2_LIB ${LIB_PCRE2})
else()
  message(STATUS "pcre2-8 not found. Building without PCRE2 support.")
endif()


# Optional dependency: RE2_LIB holds the re2 library path when it is installed
# and stays empty otherwise, so it can be passed to a link list as-is.
find_library(LIB_RE2 NAMES re2)
set(RE2_LIB "")
if(LIB_RE2)
  set(RE2_LIB ${LIB_RE2})
else()
  message(STATUS "re2 not found. Building without RE2 support.")
endif()


cxx_test(dfly_core_test dfly_core TRDP::fast_float ${PCRE2_LIB} ${RE2_LIB} LABELS DFLY)
cxx_test(compact_object_test dfly_core LABELS DFLY)
cxx_test(extent_tree_test dfly_core LABELS DFLY)
cxx_test(dash_test dfly_core file redis_test_lib DATA testdata/ids.txt.zst LABELS DFLY)
Expand All @@ -30,3 +48,11 @@ cxx_test(flatbuffers_test dfly_core TRDP::flatbuffers LABELS DFLY)
cxx_test(bloom_test dfly_core LABELS DFLY)
cxx_test(allocation_tracker_test dfly_core absl::random_random LABELS DFLY)
cxx_test(qlist_test dfly_core DATA testdata/list.txt.zst LABELS DFLY)

# Define USE_PCRE2 for dfly_core_test only when find_library located pcre2-8.
if(LIB_PCRE2)
target_compile_definitions(dfly_core_test PRIVATE USE_PCRE2)
endif()

# Define USE_RE2 for dfly_core_test only when find_library located re2.
if(LIB_RE2)
target_compile_definitions(dfly_core_test PRIVATE USE_RE2)
endif()
199 changes: 198 additions & 1 deletion src/core/dfly_core_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,17 @@
#include <absl/strings/charconv.h>
#include <absl/strings/numbers.h>
#include <fast_float/fast_float.h>

#ifdef USE_PCRE2
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#endif

#include <reflex/matcher.h>
#include <re2/re2.h>

#include <random>
#include <regex>

#include "base/gtest.h"
#include "base/logging.h"
Expand Down Expand Up @@ -41,6 +49,124 @@ static string GetRandomHex(size_t len) {
return res;
}

/* Glob-style pattern matching taken from Redis.
 *
 * Matches the first `stringLen` bytes of `string` against the first `patternLen`
 * bytes of `pattern`. Neither buffer has to be NUL-terminated: only bytes inside
 * the given lengths are read.
 *
 * Supported syntax:
 *   *       any (possibly empty) sequence of characters
 *   ?       any single character
 *   [...]   character class; a leading '^' negates it, "a-z" denotes a range and
 *           a backslash takes the next character literally
 *   \x      the character x, taken literally
 *
 * When `nocase` is non-zero, literal characters and class members are compared
 * after tolower(); backslash-escaped characters inside a class are still
 * compared exactly.
 *
 * Returns 1 if the whole string matches the whole pattern and 0 otherwise.
 */
static int stringmatchlen(const char* pattern, int patternLen, const char* string, int stringLen,
                          int nocase) {
  // tolower() is undefined for negative arguments other than EOF, which is what a
  // plain (signed) char holding a non-ASCII byte converts to. Leave those as-is.
  const auto lower = [](int c) { return c < 0 ? c : tolower(c); };

  while (patternLen && stringLen) {
    switch (pattern[0]) {
      case '*':
        // Collapse a run of '*' into one. pattern[1] is only inspected while it
        // is still inside the pattern.
        while (patternLen > 1 && pattern[1] == '*') {
          pattern++;
          patternLen--;
        }
        if (patternLen == 1)
          return 1; /* match */
        // Try to match the remainder of the pattern at every suffix of string.
        while (stringLen) {
          if (stringmatchlen(pattern + 1, patternLen - 1, string, stringLen, nocase))
            return 1; /* match */
          string++;
          stringLen--;
        }
        return 0; /* no match */
      case '?':
        string++;
        stringLen--;
        break;
      case '[': {
        pattern++;
        patternLen--;
        // Only look for '^' if the pattern did not end right after '['.
        const bool neg = patternLen && pattern[0] == '^';
        if (neg) {
          pattern++;
          patternLen--;
        }
        int match = 0;
        while (1) {
          if (patternLen >= 2 && pattern[0] == '\\') {
            pattern++;
            patternLen--;
            if (pattern[0] == string[0])
              match = 1;
          } else if (patternLen == 0) {
            // Unterminated class: step back one so that the common advance after
            // the switch leaves the pattern exactly exhausted.
            pattern--;
            patternLen++;
            break;
          } else if (pattern[0] == ']') {
            break;
          } else if (patternLen >= 3 && pattern[1] == '-') {
            int start = pattern[0];
            int end = pattern[2];
            int c = string[0];
            if (start > end) {
              const int t = start;
              start = end;
              end = t;
            }
            if (nocase) {
              start = lower(start);
              end = lower(end);
              c = lower(c);
            }
            pattern += 2;
            patternLen -= 2;
            if (c >= start && c <= end)
              match = 1;
          } else {
            if (!nocase) {
              if (pattern[0] == string[0])
                match = 1;
            } else {
              if (lower(pattern[0]) == lower(string[0]))
                match = 1;
            }
          }
          pattern++;
          patternLen--;
        }
        if (neg)
          match = !match;
        if (!match)
          return 0; /* no match */
        string++;
        stringLen--;
        break;
      }
      case '\\':
        // A trailing lone backslash is matched as a literal backslash.
        if (patternLen >= 2) {
          pattern++;
          patternLen--;
        }
        /* fall through */
      default:
        if (!nocase) {
          if (pattern[0] != string[0])
            return 0; /* no match */
        } else {
          if (lower(pattern[0]) != lower(string[0]))
            return 0; /* no match */
        }
        string++;
        stringLen--;
        break;
    }
    pattern++;
    patternLen--;
    if (stringLen == 0) {
      // The string is consumed: trailing '*' can still match the empty rest.
      // Stop at the end of the pattern instead of relying on a terminator byte.
      while (patternLen && *pattern == '*') {
        pattern++;
        patternLen--;
      }
      break;
    }
  }
  if (patternLen == 0 && stringLen == 0)
    return 1;
  return 0;
}

class TxQueueTest : public ::testing::Test {
protected:
TxQueueTest() {
Expand Down Expand Up @@ -107,6 +233,16 @@ class StringMatchTest : public ::testing::Test {
}
};

// Verifies the regex text that GlobMatcher::Glob2Regex produces for a set of glob patterns.
TEST_F(StringMatchTest, Glob2Regex) {
// An empty glob yields an empty regex.
EXPECT_EQ(GlobMatcher::Glob2Regex(""), "");
// A lone '*' becomes ".*".
EXPECT_EQ(GlobMatcher::Glob2Regex("*"), ".*");
// An escaped '?' is emitted still escaped.
EXPECT_EQ(GlobMatcher::Glob2Regex("\\?"), "\\?");
// Bracket expressions, including negated ones, are expected to pass through unchanged.
EXPECT_EQ(GlobMatcher::Glob2Regex("[abc]"), "[abc]");
EXPECT_EQ(GlobMatcher::Glob2Regex("[^abc]"), "[^abc]");
// Outside brackets, '^' and '|' are expected to come out backslash-escaped,
// while the already escaped '[' stays escaped.
EXPECT_EQ(GlobMatcher::Glob2Regex("h\\[^|"), "h\\[\\^\\|");
// Inside brackets, '$', '?' and '^' are expected to stay unescaped.
EXPECT_EQ(GlobMatcher::Glob2Regex("[$?^]a"), "[$?^]a");
}

TEST_F(StringMatchTest, Basic) {
EXPECT_EQ(MatchLen("", "", 0), 1);

Expand Down Expand Up @@ -141,8 +277,10 @@ TEST_F(StringMatchTest, Basic) {
EXPECT_EQ(MatchLen("h??llo", "hallo", 0), 0);
EXPECT_EQ(MatchLen("h\\?llo", "hallo", 0), 0);
EXPECT_EQ(MatchLen("h\\?llo", "h?llo", 0), 1);
EXPECT_EQ(MatchLen("abc?", "abc\n", 0), 1);
}

// special regex chars
TEST_F(StringMatchTest, Special) {
EXPECT_EQ(MatchLen("h\\[^|", "h[^|", 0), 1);
EXPECT_EQ(MatchLen("[^", "[^", 0), 0);
EXPECT_EQ(MatchLen("[$?^]a", "?a", 0), 1);
Expand Down Expand Up @@ -222,4 +360,63 @@ static void BM_MatchReflexFindStar(benchmark::State& state) {
}
BENCHMARK(BM_MatchReflexFindStar)->Arg(1000)->Arg(10000);

// Baseline benchmark: std::regex_match of ".*foobar" against a string of
// state.range(0) characters produced by GetRandomHex.
static void BM_MatchStd(benchmark::State& state) {
  string subject = GetRandomHex(state.range(0));
  std::regex pattern(".*foobar");
  std::smatch match;
  for (auto _ : state) {
    std::regex_match(subject, match, pattern);
  }
}
BENCHMARK(BM_MatchStd)->Arg(1000)->Arg(10000);


// Benchmarks the Redis glob matcher (stringmatchlen) with the pattern "*foobar*"
// against a string of state.range(0) characters produced by GetRandomHex.
static void BM_MatchRedisGlob(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  const char* pattern = "*foobar*";

  // Everything below is loop-invariant; compute it once so that the timed loop
  // contains nothing but the matcher call.
  const int pattern_len = static_cast<int>(strlen(pattern));
  const char* subject = random_val.c_str();
  const int subject_len = static_cast<int>(random_val.size());

  while (state.KeepRunning()) {
    DoNotOptimize(stringmatchlen(pattern, pattern_len, subject, subject_len, 0));
  }
}
BENCHMARK(BM_MatchRedisGlob)->Arg(1000)->Arg(10000);

#ifdef USE_RE2
// Benchmarks RE2::FullMatch of ".*foobar.*" (Latin1 mode) against a string of
// state.range(0) characters produced by GetRandomHex.
static void BM_MatchRe2(benchmark::State& state) {
  const string subject = GetRandomHex(state.range(0));
  const re2::RE2 pattern(".*foobar.*", re2::RE2::Latin1);
  CHECK(pattern.ok());

  for (auto _ : state) {
    DoNotOptimize(re2::RE2::FullMatch(subject, pattern));
  }
}
BENCHMARK(BM_MatchRe2)->Arg(1000)->Arg(10000);
#endif

#ifdef USE_PCRE2
// Benchmarks a JIT-compiled PCRE2 full match of ".*foobar" against a string of
// state.range(0) characters produced by GetRandomHex.
static void BM_MatchPcre2Jit(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  int errnum;
  PCRE2_SIZE erroffset;

  // Anchoring must be requested at compile time: pcre2_jit_match() ignores
  // PCRE2_ANCHORED / PCRE2_ENDANCHORED when they are passed as match options,
  // which would silently turn the intended full match into a search.
  pcre2_code* re = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(".*foobar"), PCRE2_ZERO_TERMINATED,
                                 PCRE2_ANCHORED | PCRE2_ENDANCHORED, &errnum, &erroffset, nullptr);
  CHECK(re);
  CHECK_EQ(0, pcre2_jit_compile(re, PCRE2_JIT_COMPLETE));
  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, nullptr);

  // Sanity check: the compiled pattern matches a string that ends with "foobar".
  const char sample[] = "aaaaaaaaaaaaafoobar";
  int rc = pcre2_jit_match(re, reinterpret_cast<PCRE2_SPTR>(sample), strlen(sample), 0, 0,
                           match_data, nullptr);
  CHECK_EQ(1, rc);

  const PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>(random_val.c_str());
  const PCRE2_SIZE subject_len = random_val.size();
  while (state.KeepRunning()) {
    rc = pcre2_jit_match(re, subject, subject_len, 0, 0, match_data, nullptr);
    CHECK_EQ(PCRE2_ERROR_NOMATCH, rc);
  }
  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
}
BENCHMARK(BM_MatchPcre2Jit)->Arg(1000)->Arg(10000);
#endif

} // namespace dfly
Loading

0 comments on commit 6951bf5

Please sign in to comment.