kokke · rurban · Jun 10, 2022 · Jun 10, 2022 · Jun 10, 2022 · Jun 10, 2022
diff --git a/Makefile b/Makefile
@@ -9,7 +9,7 @@ PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
           elif command -v python2 >/dev/null 2>&1; then             \
             echo 'python2';                                         \
           else                                                      \
-            echo 'Error: no compatible python version found.' >&2;  \
+            echo 'Error: no compatible python 2 version found.' >&2;  \
             exit 1;                                                 \
           fi
 
@@ -107,3 +107,8 @@ test: all
 	@echo
 	@echo
 
+CBMC := cbmc
+
+# unwindset: loop max MAX_REGEXP_OBJECTS patterns
+verify:
+	$(CBMC) -DCPROVER --unwindset 8 --unwind 16 --depth 16 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check $(CBMC_ARGS) re.c
diff --git a/README.md b/README.md
@@ -51,8 +51,6 @@ int  re_match(const char* pattern, const char* text, int* matchlength);
 ### Supported regex-operators
 The following features / regex-operators are supported by this library.
 
-NOTE: inverted character classes are buggy - see the test harness for concrete examples.
-
 
   -  `.`         Dot, matches any character
   -  `^`         Start anchor, matches beginning of string
@@ -63,7 +61,7 @@ NOTE: inverted character classes are buggy - see the test harness for concrete e
   -  `[abc]`     Character class, match if one of {'a', 'b', 'c'}
   -  `[^abc]`   Inverted class, match if NOT one of {'a', 'b', 'c'}
   -  `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z }
-  -  `\s`       Whitespace, \t \f \r \n \v and spaces
+  -  `\s`       Whitespace, '\t' '\f' '\r' '\n' '\v' and spaces
   -  `\S`       Non-whitespace
   -  `\w`       Alphanumeric, [a-zA-Z0-9_]
   -  `\W`       Non-alphanumeric
@@ -90,7 +88,7 @@ int match_length;
 /* Standard null-terminated C-string to search: */
 const char* string_to_search = "ahem.. 'hello world !' ..";
 
-/* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */
+/* Compile a simple regular expression using character classes, meta-char and greedy quantifiers: */
 re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?");
 
 /* Check if the regex matches the text: */
@@ -104,10 +102,15 @@ if (match_idx != -1)
 For more usage examples I encourage you to look at the code in the `tests`-folder.
 
 ### TODO
-- Fix the implementation of inverted character classes.
-- Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
+- Fix implementation of branches (`|`) (see the branch), and add groups as well, e.g. `(a|b)+`.
+- `re_match_capture()` with groups.
 - Add `example.c` that demonstrates usage.
 - Add `tests/test_perf.c` for performance and time measurements.
+- Add optional multibyte support (e.g. UTF-8). On non-wchar systems roll our own.
+- Word boundary: \b \B
+- non-greedy, lazy quantifiers (??, +?, *?, {n,m}?)
+- case-insensitive option or API. `re_matchi()`
+- '.' may not match '\r' nor '\n', unless a single-line option is given.
 - Testing: Improve pattern rejection testing.
 
 ### FAQ
@@ -118,6 +121,3 @@ For more usage examples I encourage you to look at the code in the `tests`-folde
 ### License
 All material in this repository is in the public domain.
 
-
-
-
diff --git a/formal_verification.md b/formal_verification.md
@@ -140,3 +140,8 @@ sys     9m34.654s
 klee@780432c1aaae0:~$ 
 ```
 
+----
+
+For the formal verifier CBMC just call make verify.
+This verifier is much faster and better than klee.
+https://www.cprover.org/cbmc/
diff --git a/re.c b/re.c
@@ -15,7 +15,7 @@
  *   '+'        Plus, match one or more (greedy)
  *   '?'        Question, match zero or one (non-greedy)
  *   '[abc]'    Character class, match if one of {'a', 'b', 'c'}
- *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken!
+ *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'}
  *   '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
  *   '\s'       Whitespace, \t \f \r \n \v and spaces
  *   '\S'       Non-whitespace
@@ -35,15 +35,19 @@
 
 /* Definitions: */
 
-#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
 #define MAX_CHAR_CLASS_LEN      40    /* Max length of character-class buffer in.   */
+#ifndef CPROVER
+#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
+#else
+#define MAX_REGEXP_OBJECTS      8    /* faster formal proofs */
+#endif
 
 
-enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
+enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
 
 typedef struct regex_t
 {
-  unsigned char  type;   /* CHAR, STAR, etc.                      */
+  enum regex_type_e type;   /* CHAR, STAR, etc.                      */
   union
   {
     unsigned char  ch;   /*      the character itself             */
@@ -152,22 +156,17 @@ re_t re_compile(const char* pattern)
             case 's': {    re_compiled[j].type = WHITESPACE;       } break;
             case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
 
-            /* Escaped character, e.g. '.' or '$' */
+              /* Escaped character, e.g. '.', '$' or '\\' */
             default:
             {
               re_compiled[j].type = CHAR;
               re_compiled[j].u.ch = pattern[i];
             } break;
           }
         }
-        /* '\\' as last char in pattern -> invalid regular expression. */
-/*
+        /* '\\' as last char without previous \\ -> invalid regular expression. */
         else
-        {
-          re_compiled[j].type = CHAR;
-          re_compiled[j].ch = pattern[i];
-        }
-*/
+          return 0;
       } break;
 
       /* Character class: */
@@ -226,19 +225,17 @@ re_t re_compile(const char* pattern)
         re_compiled[j].u.ccl = &ccl_buf[buf_begin];
       } break;
 
+      case '\0': // EOL
+        return 0;
+
       /* Other characters: */
       default:
       {
         re_compiled[j].type = CHAR;
-        re_compiled[j].u.ch = c;
+        // cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c
+        re_compiled[j].u.ch = (unsigned char)c;
       } break;
     }
-    /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
-    if (pattern[i] == 0)
-    {
-      return 0;
-    }
-
     i += 1;
     j += 1;
   }
@@ -250,22 +247,31 @@ re_t re_compile(const char* pattern)
 
 void re_print(regex_t* pattern)
 {
-  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
+  const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
-  int i;
-  int j;
+  unsigned char i;
+  unsigned char j;
   char c;
+
+  if (!pattern)
+    return;
   for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
   {
     if (pattern[i].type == UNUSED)
     {
       break;
     }
 
-    printf("type: %s", types[pattern[i].type]);
+    if (pattern[i].type <= NOT_WHITESPACE)
+      printf("type: %s", types[pattern[i].type]);
+    else
+      printf("invalid type: %d", pattern[i].type);
+
     if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
     {
       printf(" [");
+      if (pattern[i].type == INV_CHAR_CLASS)
+        printf("^");
       for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
       {
         c = pattern[i].u.ccl[j];
@@ -290,15 +296,15 @@ void re_print(regex_t* pattern)
 /* Private functions: */
 static int matchdigit(char c)
 {
-  return isdigit(c);
+  return isdigit((unsigned char)c);
 }
 static int matchalpha(char c)
 {
-  return isalpha(c);
+  return isalpha((unsigned char)c);
 }
 static int matchwhitespace(char c)
 {
-  return isspace(c);
+  return isspace((unsigned char)c);
 }
 static int matchalphanum(char c)
 {
@@ -367,7 +373,9 @@ static int matchcharclass(char c, const char* str)
     {
       if (c == '-')
       {
-        return ((str[-1] == '\0') || (str[1] == '\0'));
+        if ((str[-1] == '\0') || (str[1] == '\0'))
+            return 1;
+        // else continue
       }
       else
       {
@@ -401,6 +409,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
 {
   int prelen = *matchlength;
   const char* prepoint = text;
+  // TODO check if multibyte, and use mbtowc() then
   while ((text[0] != '\0') && matchone(p, *text))
   {
     text++;
@@ -526,3 +535,67 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 }
 
 #endif
+
+#ifdef CPROVER
+#define N 24
+
+/* Formal verification with cbmc: */
+/* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
+ */
+
+void verify_re_compile()
+{
+  /* test input - ten chars used as a regex-pattern input */
+  char arr[N];
+  /* make input symbolic, to search all paths through the code */
+  /* i.e. the input is checked for all possible ten-char combinations */
+  for (int i=0; i<sizeof(arr)-1; i++) {
+    //arr[i] = nondet_char();
+    assume(arr[i] > -127 && arr[i] < 128);
+  }
+  /* assume proper NULL termination */
+  assume(arr[sizeof(arr) - 1] == 0);
+  /* verify abscence of run-time errors - go! */
+  re_compile(arr);
+}
+
+void verify_re_print()
+{
+  regex_t pattern[MAX_REGEXP_OBJECTS];
+  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
+    //pattern[i].type = nondet_uchar();
+    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
+    pattern[i].u.ccl = nondet_long();
+  }
+  re_print(&pattern);
+}
+
+void verify_re_match()
+{
+  int length;
+  regex_t pattern[MAX_REGEXP_OBJECTS];
+  char arr[N];
+
+  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
+    //pattern[i].type = nondet_uchar();
+    //pattern[i].u.ch = nondet_int();
+    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
+    assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1);
+  }
+  for (int i=0; i<sizeof(arr)-1; i++) {
+    assume(arr[i] > -127 && arr[i] < 128);
+  }
+  /* assume proper NULL termination */
+  assume(arr[sizeof(arr) - 1] == 0);
+
+  re_match(&pattern, arr, &length);
+}
+
+int main(int argc, char* argv[])
+{
+  verify_re_compile();
+  verify_re_printh();
+  verify_re_match();
+  return 0;
+}
+#endif
diff --git a/scripts/regex_test.py b/scripts/regex_test.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 """
-  This program generates random text that matches a given regex-pattern.
+  This python2 program generates random text that matches a given regex-pattern.
   The pattern is given via sys.argv and the generated text is passed to
   the binary 'tests/test_rand' to check if the generated text also matches
   the regex-pattern in the C implementation.

diff --git a/tests/test1.c b/tests/test1.c
@@ -4,6 +4,7 @@
 
 #include <stdio.h>
 #include <string.h>
+//#include <locale.h>
 #include "re.h"
 
 
@@ -36,6 +37,8 @@ char* test_vector[][4] =
   { OK,  "[abc]",                     "1c2",              (char*) 1      },
   { NOK, "[abc]",                     "1C2",              (char*) 0      },
   { OK,  "[1-5]+",                    "0123456789",       (char*) 5      },
+  { OK,  "[1-5-]+",                   "123-",             (char*) 4      },
+  { OK,  "[1-5-]+[-1-2]-[-]", 	      "13132231--353444-511--",    (char *) 22  },
   { OK,  "[.2]",                      "1C2",              (char*) 1      },
   { OK,  "a*$",                       "Xaa",              (char*) 2      },
   { OK,  "a*$",                       "Xaa",              (char*) 2      },
@@ -75,20 +78,23 @@ char* test_vector[][4] =
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world!   ",  (char*) 11     },
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world  !",   (char*) 13     },
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "hello World    !", (char*) 15     },
-  { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */
-/*
+  { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      },
   { OK,  "[^\\w][^-1-4]",     ")T",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     ")^",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "*)",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "!.",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     " x",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "$b",          (char*) 2      },
-*/
   { OK,  ".?bar",                      "real_bar",        (char*) 4      },
   { NOK, ".?bar",                      "real_foo",        (char*) 0      },
   { NOK, "X?Y",                        "Z",               (char*) 0      },
   { OK, "[a-z]+\nbreak",              "blahblah\nbreak",  (char*) 14     },
   { OK, "[a-z\\s]+\nbreak",           "bla bla \nbreak",  (char*) 14     },
+  { NOK, "a\\",                       "a\\",              (char*) 0      },
+  { NOK, "\\",                        "\\",               (char*) 0      },
+  { OK,  "\\\\",                      "\\",               (char*) 1      },
+  // no multibyte support yet
+  //{ OK,  "\\w+",                      "Çüéâ",             (char*) 4      },
 };
 
 
@@ -105,6 +111,8 @@ int main()
     size_t nfailed = 0;
     size_t i;
 
+    //setlocale(LC_CTYPE, "en_US.UTF-8");
+
     for (i = 0; i < ntests; ++i)
     {
         pattern = test_vector[i][1];