Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3 fixes #78

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
elif command -v python2 >/dev/null 2>&1; then \
echo 'python2'; \
else \
echo 'Error: no compatible python version found.' >&2; \
echo 'Error: no compatible python 2 version found.' >&2; \
exit 1; \
fi

Expand Down Expand Up @@ -107,3 +107,8 @@ test: all
@echo
@echo

CBMC := cbmc

# unwindset: loop max MAX_REGEXP_OBJECTS patterns
verify:
$(CBMC) -DCPROVER --unwindset 8 --unwind 16 --depth 16 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check $(CBMC_ARGS) re.c
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ int re_match(const char* pattern, const char* text, int* matchlength);
### Supported regex-operators
The following features / regex-operators are supported by this library.

NOTE: inverted character classes are buggy - see the test harness for concrete examples.


- `.` Dot, matches any character
- `^` Start anchor, matches beginning of string
Expand All @@ -63,7 +61,7 @@ NOTE: inverted character classes are buggy - see the test harness for concrete e
- `[abc]` Character class, match if one of {'a', 'b', 'c'}
- `[^abc]` Inverted class, match if NOT one of {'a', 'b', 'c'}
- `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z }
- `\s` Whitespace, \t \f \r \n \v and spaces
- `\s` Whitespace, '\t' '\f' '\r' '\n' '\v' and spaces
- `\S` Non-whitespace
- `\w` Alphanumeric, [a-zA-Z0-9_]
- `\W` Non-alphanumeric
Expand All @@ -90,7 +88,7 @@ int match_length;
/* Standard null-terminated C-string to search: */
const char* string_to_search = "ahem.. 'hello world !' ..";

/* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */
/* Compile a simple regular expression using character classes, meta-char and greedy quantifiers: */
re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?");

/* Check if the regex matches the text: */
Expand All @@ -104,10 +102,15 @@ if (match_idx != -1)
For more usage examples I encourage you to look at the code in the `tests`-folder.

### TODO
- Fix the implementation of inverted character classes.
- Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
- Fix implementation of branches (`|`) (see the branch), and add groups as well, e.g. `(a|b)+`.
- `re_match_capture()` with groups.
- Add `example.c` that demonstrates usage.
- Add `tests/test_perf.c` for performance and time measurements.
- Add optional multibyte support (e.g. UTF-8). On non-wchar systems roll our own.
- Word boundary: \b \B
- non-greedy, lazy quantifiers (??, +?, *?, {n,m}?)
- case-insensitive option or API. `re_matchi()`
- '.' may not match '\r' nor '\n', unless a single-line option is given.
- Testing: Improve pattern rejection testing.

### FAQ
Expand All @@ -118,6 +121,3 @@ For more usage examples I encourage you to look at the code in the `tests`-folde
### License
All material in this repository is in the public domain.




5 changes: 5 additions & 0 deletions formal_verification.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,8 @@ sys 9m34.654s
klee@780432c1aaae0:~$
```

----

For the formal verifier CBMC just call make verify.
This verifier is much faster and better than klee.
https://www.cprover.org/cbmc/
127 changes: 100 additions & 27 deletions re.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* '+' Plus, match one or more (greedy)
* '?' Question, match zero or one (non-greedy)
* '[abc]' Character class, match if one of {'a', 'b', 'c'}
* '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken!
* '[^abc]' Inverted class, match if NOT one of {'a', 'b', 'c'}
* '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
* '\s' Whitespace, \t \f \r \n \v and spaces
* '\S' Non-whitespace
Expand All @@ -35,15 +35,19 @@

/* Definitions: */

#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
#define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */
#ifndef CPROVER
#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
#else
#define MAX_REGEXP_OBJECTS 8 /* faster formal proofs */
#endif


enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };

typedef struct regex_t
{
unsigned char type; /* CHAR, STAR, etc. */
enum regex_type_e type; /* CHAR, STAR, etc. */
union
{
unsigned char ch; /* the character itself */
Expand Down Expand Up @@ -152,22 +156,17 @@ re_t re_compile(const char* pattern)
case 's': { re_compiled[j].type = WHITESPACE; } break;
case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break;

/* Escaped character, e.g. '.' or '$' */
/* Escaped character, e.g. '.', '$' or '\\' */
default:
{
re_compiled[j].type = CHAR;
re_compiled[j].u.ch = pattern[i];
} break;
}
}
/* '\\' as last char in pattern -> invalid regular expression. */
/*
/* '\\' as last char without previous \\ -> invalid regular expression. */
else
{
re_compiled[j].type = CHAR;
re_compiled[j].ch = pattern[i];
}
*/
return 0;
} break;

/* Character class: */
Expand Down Expand Up @@ -226,19 +225,17 @@ re_t re_compile(const char* pattern)
re_compiled[j].u.ccl = &ccl_buf[buf_begin];
} break;

case '\0': // EOL
return 0;

/* Other characters: */
default:
{
re_compiled[j].type = CHAR;
re_compiled[j].u.ch = c;
// cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c
re_compiled[j].u.ch = (unsigned char)c;
} break;
}
/* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
if (pattern[i] == 0)
{
return 0;
}

i += 1;
j += 1;
}
Expand All @@ -250,22 +247,31 @@ re_t re_compile(const char* pattern)

void re_print(regex_t* pattern)
{
const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };

int i;
int j;
unsigned char i;
unsigned char j;
char c;

if (!pattern)
return;
for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
{
if (pattern[i].type == UNUSED)
{
break;
}

printf("type: %s", types[pattern[i].type]);
if (pattern[i].type <= NOT_WHITESPACE)
printf("type: %s", types[pattern[i].type]);
else
printf("invalid type: %d", pattern[i].type);

if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
{
printf(" [");
if (pattern[i].type == INV_CHAR_CLASS)
printf("^");
for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
{
c = pattern[i].u.ccl[j];
Expand All @@ -290,15 +296,15 @@ void re_print(regex_t* pattern)
/* Private functions: */
static int matchdigit(char c)
{
return isdigit(c);
return isdigit((unsigned char)c);
}
static int matchalpha(char c)
{
return isalpha(c);
return isalpha((unsigned char)c);
}
static int matchwhitespace(char c)
{
return isspace(c);
return isspace((unsigned char)c);
}
static int matchalphanum(char c)
{
Expand Down Expand Up @@ -367,7 +373,9 @@ static int matchcharclass(char c, const char* str)
{
if (c == '-')
{
return ((str[-1] == '\0') || (str[1] == '\0'));
if ((str[-1] == '\0') || (str[1] == '\0'))
return 1;
// else continue
}
else
{
Expand Down Expand Up @@ -401,6 +409,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
{
int prelen = *matchlength;
const char* prepoint = text;
// TODO check if multibyte, and use mbtowc() then
while ((text[0] != '\0') && matchone(p, *text))
{
text++;
Expand Down Expand Up @@ -526,3 +535,67 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
}

#endif

#ifdef CPROVER
#define N 24

/* Formal verification with cbmc: */
/* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
*/

void verify_re_compile()
{
/* test input - ten chars used as a regex-pattern input */
char arr[N];
/* make input symbolic, to search all paths through the code */
/* i.e. the input is checked for all possible ten-char combinations */
for (int i=0; i<sizeof(arr)-1; i++) {
//arr[i] = nondet_char();
assume(arr[i] > -127 && arr[i] < 128);
}
/* assume proper NULL termination */
assume(arr[sizeof(arr) - 1] == 0);
/* verify abscence of run-time errors - go! */
re_compile(arr);
}

void verify_re_print()
{
regex_t pattern[MAX_REGEXP_OBJECTS];
for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
//pattern[i].type = nondet_uchar();
assume(pattern[i].type >= 0 && pattern[i].type <= 255);
pattern[i].u.ccl = nondet_long();
}
re_print(&pattern);
}

void verify_re_match()
{
int length;
regex_t pattern[MAX_REGEXP_OBJECTS];
char arr[N];

for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
//pattern[i].type = nondet_uchar();
//pattern[i].u.ch = nondet_int();
assume(pattern[i].type >= 0 && pattern[i].type <= 255);
assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1);
}
for (int i=0; i<sizeof(arr)-1; i++) {
assume(arr[i] > -127 && arr[i] < 128);
}
/* assume proper NULL termination */
assume(arr[sizeof(arr) - 1] == 0);

re_match(&pattern, arr, &length);
}

int main(int argc, char* argv[])
{
verify_re_compile();
verify_re_printh();
verify_re_match();
return 0;
}
#endif
2 changes: 1 addition & 1 deletion scripts/regex_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python

"""
This program generates random text that matches a given regex-pattern.
This python2 program generates random text that matches a given regex-pattern.
The pattern is given via sys.argv and the generated text is passed to
the binary 'tests/test_rand' to check if the generated text also matches
the regex-pattern in the C implementation.
Expand Down
14 changes: 11 additions & 3 deletions tests/test1.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <stdio.h>
#include <string.h>
//#include <locale.h>
#include "re.h"


Expand Down Expand Up @@ -36,6 +37,8 @@ char* test_vector[][4] =
{ OK, "[abc]", "1c2", (char*) 1 },
{ NOK, "[abc]", "1C2", (char*) 0 },
{ OK, "[1-5]+", "0123456789", (char*) 5 },
{ OK, "[1-5-]+", "123-", (char*) 4 },
{ OK, "[1-5-]+[-1-2]-[-]", "13132231--353444-511--", (char *) 22 },
{ OK, "[.2]", "1C2", (char*) 1 },
{ OK, "a*$", "Xaa", (char*) 2 },
{ OK, "a*$", "Xaa", (char*) 2 },
Expand Down Expand Up @@ -75,20 +78,23 @@ char* test_vector[][4] =
{ OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! ", (char*) 11 },
{ OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !", (char*) 13 },
{ OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello World !", (char*) 15 },
{ NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0", (char*) 0 }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */
/*
{ NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0", (char*) 0 },
{ OK, "[^\\w][^-1-4]", ")T", (char*) 2 },
{ OK, "[^\\w][^-1-4]", ")^", (char*) 2 },
{ OK, "[^\\w][^-1-4]", "*)", (char*) 2 },
{ OK, "[^\\w][^-1-4]", "!.", (char*) 2 },
{ OK, "[^\\w][^-1-4]", " x", (char*) 2 },
{ OK, "[^\\w][^-1-4]", "$b", (char*) 2 },
*/
{ OK, ".?bar", "real_bar", (char*) 4 },
{ NOK, ".?bar", "real_foo", (char*) 0 },
{ NOK, "X?Y", "Z", (char*) 0 },
{ OK, "[a-z]+\nbreak", "blahblah\nbreak", (char*) 14 },
{ OK, "[a-z\\s]+\nbreak", "bla bla \nbreak", (char*) 14 },
{ NOK, "a\\", "a\\", (char*) 0 },
{ NOK, "\\", "\\", (char*) 0 },
{ OK, "\\\\", "\\", (char*) 1 },
// no multibyte support yet
//{ OK, "\\w+", "Çüéâ", (char*) 4 },
};


Expand All @@ -105,6 +111,8 @@ int main()
size_t nfailed = 0;
size_t i;

//setlocale(LC_CTYPE, "en_US.UTF-8");

for (i = 0; i < ntests; ++i)
{
pattern = test_vector[i][1];
Expand Down