From 7fb7a51bcc24f94f1c5c37e6f33e410cc32977fd Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 13:40:28 +0200 Subject: [PATCH 01/10] add cbmc verify and fix a --conversion-check add another formal verifier (much easier to use), and fix an invalid signed conversion --- Makefile | 4 ++++ formal_verification.md | 5 +++++ re.c | 30 +++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 857d2ee..ece0acb 100644 --- a/Makefile +++ b/Makefile @@ -107,3 +107,7 @@ test: all @echo @echo +CBMC := cbmc + +verify: + $(CBMC) -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c diff --git a/formal_verification.md b/formal_verification.md index 46fc9ee..a36bb45 100644 --- a/formal_verification.md +++ b/formal_verification.md @@ -140,3 +140,8 @@ sys 9m34.654s klee@780432c1aaae0:~$ ``` +---- + +For the formal verifier CBMC just call make verify. +This verifier is much faster and better than klee. 
+https://www.cprover.org/cbmc/ diff --git a/re.c b/re.c index 20d1474..896a417 100644 --- a/re.c +++ b/re.c @@ -230,7 +230,8 @@ re_t re_compile(const char* pattern) default: { re_compiled[j].type = CHAR; - re_compiled[j].u.ch = c; + // cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c + re_compiled[j].u.ch = (unsigned char)c; } break; } /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */ @@ -526,3 +527,30 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength) } #endif + +#ifdef CPROVER +#define N 24 + +/* Formal verification with cbmc: */ +/* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c + */ +int main(int argc, char* argv[]) +{ + /* test input - ten chars used as a regex-pattern input */ + char arr[N]; + + /* make input symbolic, to search all paths through the code */ + /* i.e. the input is checked for all possible ten-char combinations */ + for (int i=0; i -127 && arr[i] < 128); + } + /* assume proper NULL termination */ + assume(arr[sizeof(arr) - 1] == 0); + + /* verify abscence of run-time errors - go! 
*/ + re_compile(arr); + + return 0; +} +#endif From 69afafec0a9db141a31706239e12e736faec7469 Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 13:52:11 +0200 Subject: [PATCH 02/10] extend CBMC checks to all APIs compare GH #76 --- Makefile | 2 +- re.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ece0acb..369f419 100644 --- a/Makefile +++ b/Makefile @@ -110,4 +110,4 @@ test: all CBMC := cbmc verify: - $(CBMC) -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c + $(CBMC) -DCPROVER --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c diff --git a/re.c b/re.c index 896a417..6b4cff6 100644 --- a/re.c +++ b/re.c @@ -536,8 +536,10 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength) */ int main(int argc, char* argv[]) { + int length; /* test input - ten chars used as a regex-pattern input */ char arr[N]; + regex_t pattern[N]; /* make input symbolic, to search all paths through the code */ /* i.e. the input is checked for all possible ten-char combinations */ @@ -547,10 +549,17 @@ int main(int argc, char* argv[]) } /* assume proper NULL termination */ assume(arr[sizeof(arr) - 1] == 0); - /* verify abscence of run-time errors - go! 
*/ re_compile(arr); + for (int i=0; i Date: Fri, 10 Jun 2022 13:58:30 +0200 Subject: [PATCH 03/10] fix GH #76 out-of-bounds with invalid types in re_print --- re.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/re.c b/re.c index 6b4cff6..a562ce4 100644 --- a/re.c +++ b/re.c @@ -251,7 +251,7 @@ re_t re_compile(const char* pattern) void re_print(regex_t* pattern) { - const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" }; + const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ }; int i; int j; @@ -263,7 +263,11 @@ void re_print(regex_t* pattern) break; } - printf("type: %s", types[pattern[i].type]); + if (pattern[i].type <= NOT_WHITESPACE) + printf("type: %s", types[pattern[i].type]); + else + printf("invalid type: %d", pattern[i].type); + if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS) { printf(" ["); From bd55c35edf45d42a99395446db86e7c84482862c Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 14:26:35 +0200 Subject: [PATCH 04/10] refactor cbmc proofs a bit separate functions. check assume vs nondet_uchar() (both are the same). use a smaller MAX_REGEXP_OBJECTS for cbmc (much faster that way) improve the no buffer-out-of-bounds access on invalid patterns check. 
--- Makefile | 3 ++- re.c | 70 +++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 369f419..2a204b3 100644 --- a/Makefile +++ b/Makefile @@ -109,5 +109,6 @@ test: all CBMC := cbmc +# unwindset: loop max MAX_REGEXP_OBJECTS patterns verify: - $(CBMC) -DCPROVER --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c + $(CBMC) -DCPROVER --unwindset 8 --unwind 16 --depth 16 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check $(CBMC_ARGS) re.c diff --git a/re.c b/re.c index a562ce4..05aa97c 100644 --- a/re.c +++ b/re.c @@ -35,8 +35,12 @@ /* Definitions: */ -#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */ #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */ +#ifndef CPROVER +#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. 
*/ +#else +#define MAX_REGEXP_OBJECTS 8 /* faster formal proofs */ +#endif enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; @@ -226,6 +230,9 @@ re_t re_compile(const char* pattern) re_compiled[j].u.ccl = &ccl_buf[buf_begin]; } break; + case '\0': // EOL + return 0; + /* Other characters: */ default: { @@ -234,12 +241,6 @@ re_t re_compile(const char* pattern) re_compiled[j].u.ch = (unsigned char)c; } break; } - /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */ - if (pattern[i] == 0) - { - return 0; - } - i += 1; j += 1; } @@ -251,11 +252,14 @@ re_t re_compile(const char* pattern) void re_print(regex_t* pattern) { - const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ }; + const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ }; - int i; - int j; + unsigned char i; + unsigned char j; char c; + + if (!pattern) + return; for (i = 0; i < MAX_REGEXP_OBJECTS; ++i) { if (pattern[i].type == UNUSED) @@ -538,32 +542,60 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength) /* Formal verification with cbmc: */ /* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c */ -int main(int argc, char* argv[]) + +void verify_re_compile() { - int length; /* test input - ten chars used as a 
regex-pattern input */ char arr[N]; - regex_t pattern[N]; - /* make input symbolic, to search all paths through the code */ /* i.e. the input is checked for all possible ten-char combinations */ for (int i=0; i -127 && arr[i] < 128); + //arr[i] = nondet_char(); + assume(arr[i] > -127 && arr[i] < 128); } /* assume proper NULL termination */ assume(arr[sizeof(arr) - 1] == 0); /* verify abscence of run-time errors - go! */ re_compile(arr); +} - for (int i=0; i= 0 && pattern[i].type <= 255); + pattern[i].u.ccl = nondet_long(); } re_print(&pattern); +} + +void verify_re_match() +{ + int length; + regex_t pattern[MAX_REGEXP_OBJECTS]; + char arr[N]; + + for (unsigned char i=0; i= 0 && pattern[i].type <= 255); + assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1); + } + for (int i=0; i -127 && arr[i] < 128); + } + /* assume proper NULL termination */ + assume(arr[sizeof(arr) - 1] == 0); re_match(&pattern, arr, &length); +} +int main(int argc, char* argv[]) +{ + verify_re_compile(); + verify_re_printh(); + verify_re_match(); return 0; } #endif From 9d25c223eedf3ce1056fcf6e31703dd5077673d7 Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 14:51:24 +0200 Subject: [PATCH 05/10] support "\\\\" pattern, and disallow "..\\" ending \\ --- re.c | 11 +++-------- tests/test1.c | 3 +++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/re.c b/re.c index 05aa97c..4696505 100644 --- a/re.c +++ b/re.c @@ -156,7 +156,7 @@ re_t re_compile(const char* pattern) case 's': { re_compiled[j].type = WHITESPACE; } break; case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break; - /* Escaped character, e.g. '.' or '$' */ + /* Escaped character, e.g. '.', '$' or '\\' */ default: { re_compiled[j].type = CHAR; @@ -164,14 +164,9 @@ re_t re_compile(const char* pattern) } break; } } - /* '\\' as last char in pattern -> invalid regular expression. */ -/* + /* '\\' as last char without previous \\ -> invalid regular expression. 
*/ else - { - re_compiled[j].type = CHAR; - re_compiled[j].ch = pattern[i]; - } -*/ + return 0; } break; /* Character class: */ diff --git a/tests/test1.c b/tests/test1.c index 5fdfe74..af43c99 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -89,6 +89,9 @@ char* test_vector[][4] = { NOK, "X?Y", "Z", (char*) 0 }, { OK, "[a-z]+\nbreak", "blahblah\nbreak", (char*) 14 }, { OK, "[a-z\\s]+\nbreak", "bla bla \nbreak", (char*) 14 }, + { NOK, "a\\", "a\\", (char*) 0 }, + { NOK, "\\", "\\", (char*) 0 }, + { OK, "\\\\", "\\", (char*) 1 }, }; From 7bd15de3604148bf4dd2c4e41851fdc83c86dfb3 Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 14:54:19 +0200 Subject: [PATCH 06/10] Clarify python2 is needed --- Makefile | 2 +- scripts/regex_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 2a204b3..4d84611 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \ elif command -v python2 >/dev/null 2>&1; then \ echo 'python2'; \ else \ - echo 'Error: no compatible python version found.' >&2; \ + echo 'Error: no compatible python 2 version found.' >&2; \ exit 1; \ fi diff --git a/scripts/regex_test.py b/scripts/regex_test.py index 4fa98de..08b4c5e 100755 --- a/scripts/regex_test.py +++ b/scripts/regex_test.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ - This program generates random text that matches a given regex-pattern. + This python2 program generates random text that matches a given regex-pattern. The pattern is given via sys.argv and the generated text is passed to the binary 'tests/test_rand' to check if the generated text also matches the regex-pattern in the C implementation. 
From 0388df31ef50e5df4681da44f3828b5c112aa4e0 Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 15:37:47 +0200 Subject: [PATCH 07/10] re-enable INV_CHAR_CLASS and use the enum type internally --- README.md | 7 +------ re.c | 8 +++++--- tests/test1.c | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0a2be86..fabddab 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,6 @@ int re_match(const char* pattern, const char* text, int* matchlength); ### Supported regex-operators The following features / regex-operators are supported by this library. -NOTE: inverted character classes are buggy - see the test harness for concrete examples. - - `.` Dot, matches any character - `^` Start anchor, matches beginning of string @@ -104,10 +102,10 @@ if (match_idx != -1) For more usage examples I encourage you to look at the code in the `tests`-folder. ### TODO -- Fix the implementation of inverted character classes. - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`. - Add `example.c` that demonstrates usage. - Add `tests/test_perf.c` for performance and time measurements. +- Add optional multibyte support (e.g. UTF-8) - Testing: Improve pattern rejection testing. ### FAQ @@ -118,6 +116,3 @@ For more usage examples I encourage you to look at the code in the `tests`-folde ### License All material in this repository is in the public domain. - - - diff --git a/re.c b/re.c index 4696505..d4413e6 100644 --- a/re.c +++ b/re.c @@ -15,7 +15,7 @@ * '+' Plus, match one or more (greedy) * '?' Question, match zero or one (non-greedy) * '[abc]' Character class, match if one of {'a', 'b', 'c'} - * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken! 
+ * '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} * '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z } * '\s' Whitespace, \t \f \r \n \v and spaces * '\S' Non-whitespace @@ -43,11 +43,11 @@ #endif -enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; +enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; typedef struct regex_t { - unsigned char type; /* CHAR, STAR, etc. */ + enum regex_type_e type; /* CHAR, STAR, etc. */ union { unsigned char ch; /* the character itself */ @@ -270,6 +270,8 @@ void re_print(regex_t* pattern) if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS) { printf(" ["); + if (pattern[i].type == INV_CHAR_CLASS) + printf("^"); for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j) { c = pattern[i].u.ccl[j]; diff --git a/tests/test1.c b/tests/test1.c index af43c99..7005494 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -75,15 +75,13 @@ char* test_vector[][4] = { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! 
", (char*) 11 }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", (char*) 13 }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello World !", (char*) 15 }, - { NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0", (char*) 0 }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */ -/* + { NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0", (char*) 0 }, { OK, "[^\\w][^-1-4]", ")T", (char*) 2 }, { OK, "[^\\w][^-1-4]", ")^", (char*) 2 }, { OK, "[^\\w][^-1-4]", "*)", (char*) 2 }, { OK, "[^\\w][^-1-4]", "!.", (char*) 2 }, { OK, "[^\\w][^-1-4]", " x", (char*) 2 }, { OK, "[^\\w][^-1-4]", "$b", (char*) 2 }, -*/ { OK, ".?bar", "real_bar", (char*) 4 }, { NOK, ".?bar", "real_foo", (char*) 0 }, { NOK, "X?Y", "Z", (char*) 0 }, From f334c5b3ff61acb6fbeebf83c11502bb9989b64b Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Fri, 10 Jun 2022 15:38:31 +0200 Subject: [PATCH 08/10] prepare multi-byte support and fix isalpha crashes on bad libc's. Fixes GH #70. e.g. UTF-8. --- re.c | 7 ++++--- tests/test1.c | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/re.c b/re.c index d4413e6..fae8aa0 100644 --- a/re.c +++ b/re.c @@ -296,15 +296,15 @@ void re_print(regex_t* pattern) /* Private functions: */ static int matchdigit(char c) { - return isdigit(c); + return isdigit((unsigned char)c); } static int matchalpha(char c) { - return isalpha(c); + return isalpha((unsigned char)c); } static int matchwhitespace(char c) { - return isspace(c); + return isspace((unsigned char)c); } static int matchalphanum(char c) { @@ -407,6 +407,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle { int prelen = *matchlength; const char* prepoint = text; + // TODO check if multibyte, and use mbtowc() then while ((text[0] != '\0') && matchone(p, *text)) { text++; diff --git a/tests/test1.c b/tests/test1.c index 7005494..b98be12 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -4,6 +4,7 @@ #include #include +//#include #include "re.h" @@ -90,6 +91,8 @@ char* 
test_vector[][4] = { NOK, "a\\", "a\\", (char*) 0 }, { NOK, "\\", "\\", (char*) 0 }, { OK, "\\\\", "\\", (char*) 1 }, + // no multibyte support yet + //{ OK, "\\w+", "Çüéâ", (char*) 4 }, }; @@ -106,6 +109,8 @@ int main() size_t nfailed = 0; size_t i; + //setlocale(LC_CTYPE, "en_US.UTF-8"); + for (i = 0; i < ntests; ++i) { pattern = test_vector[i][1]; From 148e229fb68a7875668653df6572a80f8ca8b988 Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Mon, 20 Jun 2022 08:44:45 +0200 Subject: [PATCH 09/10] TODOs and new tests --- README.md | 13 +++++++++---- tests/test1.c | 2 ++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fabddab..d74f46a 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ The following features / regex-operators are supported by this library. - `[abc]` Character class, match if one of {'a', 'b', 'c'} - `[^abc]` Inverted class, match if NOT one of {'a', 'b', 'c'} - `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z } - - `\s` Whitespace, \t \f \r \n \v and spaces + - `\s` Whitespace, '\t' '\f' '\r' '\n' '\v' and spaces - `\S` Non-whitespace - `\w` Alphanumeric, [a-zA-Z0-9_] - `\W` Non-alphanumeric @@ -88,7 +88,7 @@ int match_length; /* Standard null-terminated C-string to search: */ const char* string_to_search = "ahem.. 'hello world !' .."; -/* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */ +/* Compile a simple regular expression using character classes, meta-char and greedy quantifiers: */ re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?"); /* Check if the regex matches the text: */ @@ -102,10 +102,15 @@ if (match_idx != -1) For more usage examples I encourage you to look at the code in the `tests`-folder. ### TODO -- Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`. +- Fix implementation of branches (`|`) (see the branch), and add groups as well, e.g. 
`(a|b)+`. +- `re_match_capture()` with groups. - Add `example.c` that demonstrates usage. - Add `tests/test_perf.c` for performance and time measurements. -- Add optional multibyte support (e.g. UTF-8) +- Add optional multibyte support (e.g. UTF-8). On non-wchar systems roll our own. +- Word boundary: \b \B +- non-greedy, lazy quantifiers (??, +?, *?, {n,m}?) +- case-insensitive option or API. `re_matchi()` +- '.' may not match '\r' nor '\n', unless a single-line option is given. - Testing: Improve pattern rejection testing. ### FAQ diff --git a/tests/test1.c b/tests/test1.c index b98be12..228b2e1 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -37,6 +37,8 @@ char* test_vector[][4] = { OK, "[abc]", "1c2", (char*) 1 }, { NOK, "[abc]", "1C2", (char*) 0 }, { OK, "[1-5]+", "0123456789", (char*) 5 }, + { OK, "[1-5-]+", "123-", (char*) 4 }, + { OK, "[1-5-]+[-1-2]-[-]", "13132231--353444-511--", (char *) 22 }, { OK, "[.2]", "1C2", (char*) 1 }, { OK, "a*$", "Xaa", (char*) 2 }, { OK, "a*$", "Xaa", (char*) 2 }, From 89f513f4e8fb74a673bf9dface055faee2d4ba2a Mon Sep 17 00:00:00 2001 From: Reinhard Urban Date: Mon, 20 Jun 2022 08:56:38 +0200 Subject: [PATCH 10/10] fix ranges with ending - Fixes GH #79 and the exreg failures with [1-5-]+[-1-2]-[-] --- re.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/re.c b/re.c index fae8aa0..e81aa67 100644 --- a/re.c +++ b/re.c @@ -373,7 +373,9 @@ static int matchcharclass(char c, const char* str) { if (c == '-') { - return ((str[-1] == '\0') || (str[1] == '\0')); + if ((str[-1] == '\0') || (str[1] == '\0')) + return 1; + // else continue } else {