From 7fb7a51bcc24f94f1c5c37e6f33e410cc32977fd Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 13:40:28 +0200
Subject: [PATCH 01/10] add cbmc verify and fix a --conversion-check

add another formal verifier (much easier to use),
and fix an invalid signed conversion
---
 Makefile               |  4 ++++
 formal_verification.md |  5 +++++
 re.c                   | 30 +++++++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 857d2ee..ece0acb 100644
--- a/Makefile
+++ b/Makefile
@@ -107,3 +107,7 @@ test: all
 	@echo
 	@echo
 
+CBMC := cbmc
+
+verify:
+	$(CBMC) -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
diff --git a/formal_verification.md b/formal_verification.md
index 46fc9ee..a36bb45 100644
--- a/formal_verification.md
+++ b/formal_verification.md
@@ -140,3 +140,8 @@ sys     9m34.654s
 klee@780432c1aaae0:~$ 
 ```
 
+----
+
+For the formal verifier CBMC, just run `make verify`.
+This verifier is much faster and better than klee.
+https://www.cprover.org/cbmc/
diff --git a/re.c b/re.c
index 20d1474..896a417 100644
--- a/re.c
+++ b/re.c
@@ -230,7 +230,8 @@ re_t re_compile(const char* pattern)
       default:
       {
         re_compiled[j].type = CHAR;
-        re_compiled[j].u.ch = c;
+        // cbmc: arithmetic overflow on signed to unsigned type conversion in (unsigned char)c
+        re_compiled[j].u.ch = (unsigned char)c;
       } break;
     }
     /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
@@ -526,3 +527,30 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 }
 
 #endif
+
+#ifdef CPROVER
+#define N 24
+
+/* Formal verification with cbmc: */
+/* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
+ */
+int main(int argc, char* argv[])
+{
+  /* test input - ten chars used as a regex-pattern input */
+  char arr[N];
+
+  /* make input symbolic, to search all paths through the code */
+  /* i.e. the input is checked for all possible ten-char combinations */
+  for (int i=0; i<sizeof(arr)-1; i++) {
+      //arr[i] = nondet_char();
+      assume(arr[i] > -127 && arr[i] < 128);
+  }
+  /* assume proper NULL termination */
+  assume(arr[sizeof(arr) - 1] == 0);
+
+  /* verify abscence of run-time errors - go! */
+  re_compile(arr);
+
+  return 0;
+}
+#endif

From 69afafec0a9db141a31706239e12e736faec7469 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 13:52:11 +0200
Subject: [PATCH 02/10] extend CBMC checks to all APIs

compare GH #76
---
 Makefile |  2 +-
 re.c     | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index ece0acb..369f419 100644
--- a/Makefile
+++ b/Makefile
@@ -110,4 +110,4 @@ test: all
 CBMC := cbmc
 
 verify:
-	$(CBMC) -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
+	$(CBMC) -DCPROVER --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
diff --git a/re.c b/re.c
index 896a417..6b4cff6 100644
--- a/re.c
+++ b/re.c
@@ -536,8 +536,10 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
  */
 int main(int argc, char* argv[])
 {
+  int length;
   /* test input - ten chars used as a regex-pattern input */
   char arr[N];
+  regex_t pattern[N];
 
   /* make input symbolic, to search all paths through the code */
   /* i.e. the input is checked for all possible ten-char combinations */
@@ -547,10 +549,17 @@ int main(int argc, char* argv[])
   }
   /* assume proper NULL termination */
   assume(arr[sizeof(arr) - 1] == 0);
-
   /* verify abscence of run-time errors - go! */
   re_compile(arr);
 
+  for (int i=0; i<N; i++) {
+      pattern[i].type = nondet_uchar();
+      pattern[i].u.ch = nondet_int();
+  }
+  re_print(&pattern);
+
+  re_match(&pattern, arr, &length);
+
   return 0;
 }
 #endif

From e4486516ec68a2c8b6e3844400a27c2c30ea43a0 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 13:58:30 +0200
Subject: [PATCH 03/10] fix GH #76 out-of-bounds

with invalid types in re_print
---
 re.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/re.c b/re.c
index 6b4cff6..a562ce4 100644
--- a/re.c
+++ b/re.c
@@ -251,7 +251,7 @@ re_t re_compile(const char* pattern)
 
 void re_print(regex_t* pattern)
 {
-  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };
+  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
   int i;
   int j;
@@ -263,7 +263,11 @@ void re_print(regex_t* pattern)
       break;
     }
 
-    printf("type: %s", types[pattern[i].type]);
+    if (pattern[i].type <= NOT_WHITESPACE)
+      printf("type: %s", types[pattern[i].type]);
+    else
+      printf("invalid type: %d", pattern[i].type);
+
     if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
     {
       printf(" [");

From bd55c35edf45d42a99395446db86e7c84482862c Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 14:26:35 +0200
Subject: [PATCH 04/10] refactor cbmc proofs a bit

separate functions.
check assume vs nondet_uchar() (both are the same).
use fewer MAX_REGEXP_OBJECTS for cbmc (much faster that way)

improve the no buffer-out-of-bounds access on invalid patterns check.
---
 Makefile |  3 ++-
 re.c     | 70 +++++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index 369f419..2a204b3 100644
--- a/Makefile
+++ b/Makefile
@@ -109,5 +109,6 @@ test: all
 
 CBMC := cbmc
 
+# unwindset: loop max MAX_REGEXP_OBJECTS patterns
 verify:
-	$(CBMC) -DCPROVER --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace $(CBMC_ARGS) re.c
+	$(CBMC) -DCPROVER --unwindset 8 --unwind 16 --depth 16 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check $(CBMC_ARGS) re.c
diff --git a/re.c b/re.c
index a562ce4..05aa97c 100644
--- a/re.c
+++ b/re.c
@@ -35,8 +35,12 @@
 
 /* Definitions: */
 
-#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
 #define MAX_CHAR_CLASS_LEN      40    /* Max length of character-class buffer in.   */
+#ifndef CPROVER
+#define MAX_REGEXP_OBJECTS      30    /* Max number of regex symbols in expression. */
+#else
+#define MAX_REGEXP_OBJECTS      8    /* faster formal proofs */
+#endif
 
 
 enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
@@ -226,6 +230,9 @@ re_t re_compile(const char* pattern)
         re_compiled[j].u.ccl = &ccl_buf[buf_begin];
       } break;
 
+      case '\0': // EOL
+        return 0;
+
       /* Other characters: */
       default:
       {
@@ -234,12 +241,6 @@ re_t re_compile(const char* pattern)
         re_compiled[j].u.ch = (unsigned char)c;
       } break;
     }
-    /* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
-    if (pattern[i] == 0)
-    {
-      return 0;
-    }
-
     i += 1;
     j += 1;
   }
@@ -251,11 +252,14 @@ re_t re_compile(const char* pattern)
 
 void re_print(regex_t* pattern)
 {
-  const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
+  const char *const types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE" /*, "BRANCH" */ };
 
-  int i;
-  int j;
+  unsigned char i;
+  unsigned char j;
   char c;
+
+  if (!pattern)
+    return;
   for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
   {
     if (pattern[i].type == UNUSED)
@@ -538,32 +542,60 @@ static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
 /* Formal verification with cbmc: */
 /* cbmc -DCPROVER --64 --depth 200 --bounds-check --pointer-check --memory-leak-check --div-by-zero-check --signed-overflow-check --unsigned-overflow-check --pointer-overflow-check --conversion-check --undefined-shift-check --enum-range-check --pointer-primitive-check -trace re.c
  */
-int main(int argc, char* argv[])
+
+void verify_re_compile()
 {
-  int length;
   /* test input - ten chars used as a regex-pattern input */
   char arr[N];
-  regex_t pattern[N];
-
   /* make input symbolic, to search all paths through the code */
   /* i.e. the input is checked for all possible ten-char combinations */
   for (int i=0; i<sizeof(arr)-1; i++) {
-      //arr[i] = nondet_char();
-      assume(arr[i] > -127 && arr[i] < 128);
+    //arr[i] = nondet_char();
+    assume(arr[i] > -127 && arr[i] < 128);
   }
   /* assume proper NULL termination */
   assume(arr[sizeof(arr) - 1] == 0);
   /* verify abscence of run-time errors - go! */
   re_compile(arr);
+}
 
-  for (int i=0; i<N; i++) {
-      pattern[i].type = nondet_uchar();
-      pattern[i].u.ch = nondet_int();
+void verify_re_print()
+{
+  regex_t pattern[MAX_REGEXP_OBJECTS];
+  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
+    //pattern[i].type = nondet_uchar();
+    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
+    pattern[i].u.ccl = nondet_long();
   }
   re_print(&pattern);
+}
+
+void verify_re_match()
+{
+  int length;
+  regex_t pattern[MAX_REGEXP_OBJECTS];
+  char arr[N];
+
+  for (unsigned char i=0; i<MAX_REGEXP_OBJECTS; i++) {
+    //pattern[i].type = nondet_uchar();
+    //pattern[i].u.ch = nondet_int();
+    assume(pattern[i].type >= 0 && pattern[i].type <= 255);
+    assume(pattern[i].u.ccl >= 0 && pattern[i].u.ccl <= ~1);
+  }
+  for (int i=0; i<sizeof(arr)-1; i++) {
+    assume(arr[i] > -127 && arr[i] < 128);
+  }
+  /* assume proper NULL termination */
+  assume(arr[sizeof(arr) - 1] == 0);
 
   re_match(&pattern, arr, &length);
+}
 
+int main(int argc, char* argv[]) /* CBMC entry point: runs each proof harness in turn */
 {
+  verify_re_compile();
+  verify_re_print();
+  verify_re_match();
   return 0;
 }
 #endif

From 9d25c223eedf3ce1056fcf6e31703dd5077673d7 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 14:51:24 +0200
Subject: [PATCH 05/10] support "\\\\" pattern, and disallow "..\\"

ending \\
---
 re.c          | 11 +++--------
 tests/test1.c |  3 +++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/re.c b/re.c
index 05aa97c..4696505 100644
--- a/re.c
+++ b/re.c
@@ -156,7 +156,7 @@ re_t re_compile(const char* pattern)
             case 's': {    re_compiled[j].type = WHITESPACE;       } break;
             case 'S': {    re_compiled[j].type = NOT_WHITESPACE;   } break;
 
-            /* Escaped character, e.g. '.' or '$' */
+              /* Escaped character, e.g. '.', '$' or '\\' */
             default:
             {
               re_compiled[j].type = CHAR;
@@ -164,14 +164,9 @@ re_t re_compile(const char* pattern)
             } break;
           }
         }
-        /* '\\' as last char in pattern -> invalid regular expression. */
-/*
+        /* '\\' as last char without previous \\ -> invalid regular expression. */
         else
-        {
-          re_compiled[j].type = CHAR;
-          re_compiled[j].ch = pattern[i];
-        }
-*/
+          return 0;
       } break;
 
       /* Character class: */
diff --git a/tests/test1.c b/tests/test1.c
index 5fdfe74..af43c99 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -89,6 +89,9 @@ char* test_vector[][4] =
   { NOK, "X?Y",                        "Z",               (char*) 0      },
   { OK, "[a-z]+\nbreak",              "blahblah\nbreak",  (char*) 14     },
   { OK, "[a-z\\s]+\nbreak",           "bla bla \nbreak",  (char*) 14     },
+  { NOK, "a\\",                       "a\\",              (char*) 0      },
+  { NOK, "\\",                        "\\",               (char*) 0      },
+  { OK,  "\\\\",                      "\\",               (char*) 1      },
 };
 
 

From 7bd15de3604148bf4dd2c4e41851fdc83c86dfb3 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 14:54:19 +0200
Subject: [PATCH 06/10] Clarify python2 is needed

---
 Makefile              | 2 +-
 scripts/regex_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 2a204b3..4d84611 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \
           elif command -v python2 >/dev/null 2>&1; then             \
             echo 'python2';                                         \
           else                                                      \
-            echo 'Error: no compatible python version found.' >&2;  \
+            echo 'Error: no compatible python 2 version found.' >&2;  \
             exit 1;                                                 \
           fi
 
diff --git a/scripts/regex_test.py b/scripts/regex_test.py
index 4fa98de..08b4c5e 100755
--- a/scripts/regex_test.py
+++ b/scripts/regex_test.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 """
-  This program generates random text that matches a given regex-pattern.
+  This python2 program generates random text that matches a given regex-pattern.
   The pattern is given via sys.argv and the generated text is passed to
   the binary 'tests/test_rand' to check if the generated text also matches
   the regex-pattern in the C implementation.

From 0388df31ef50e5df4681da44f3828b5c112aa4e0 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 15:37:47 +0200
Subject: [PATCH 07/10] re-enable INV_CHAR_CLASS

and use the enum type internally
---
 README.md     | 7 +------
 re.c          | 8 +++++---
 tests/test1.c | 4 +---
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 0a2be86..fabddab 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,6 @@ int  re_match(const char* pattern, const char* text, int* matchlength);
 ### Supported regex-operators
 The following features / regex-operators are supported by this library.
 
-NOTE: inverted character classes are buggy - see the test harness for concrete examples.
-
 
   -  `.`         Dot, matches any character
   -  `^`         Start anchor, matches beginning of string
@@ -104,10 +102,10 @@ if (match_idx != -1)
 For more usage examples I encourage you to look at the code in the `tests`-folder.
 
 ### TODO
-- Fix the implementation of inverted character classes.
 - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
 - Add `example.c` that demonstrates usage.
 - Add `tests/test_perf.c` for performance and time measurements.
+- Add optional multibyte support (e.g. UTF-8)
 - Testing: Improve pattern rejection testing.
 
 ### FAQ
@@ -118,6 +116,3 @@ For more usage examples I encourage you to look at the code in the `tests`-folde
 ### License
 All material in this repository is in the public domain.
 
-
-
- 
diff --git a/re.c b/re.c
index 4696505..d4413e6 100644
--- a/re.c
+++ b/re.c
@@ -15,7 +15,7 @@
  *   '+'        Plus, match one or more (greedy)
  *   '?'        Question, match zero or one (non-greedy)
  *   '[abc]'    Character class, match if one of {'a', 'b', 'c'}
- *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'} -- NOTE: feature is currently broken!
+ *   '[^abc]'   Inverted class, match if NOT one of {'a', 'b', 'c'}
  *   '[a-zA-Z]' Character ranges, the character set of the ranges { a-z | A-Z }
  *   '\s'       Whitespace, \t \f \r \n \v and spaces
  *   '\S'       Non-whitespace
@@ -43,11 +43,11 @@
 #endif
 
 
-enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
+enum regex_type_e { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };
 
 typedef struct regex_t
 {
-  unsigned char  type;   /* CHAR, STAR, etc.                      */
+  enum regex_type_e type;   /* CHAR, STAR, etc.                      */
   union
   {
     unsigned char  ch;   /*      the character itself             */
@@ -270,6 +270,8 @@ void re_print(regex_t* pattern)
     if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
     {
       printf(" [");
+      if (pattern[i].type == INV_CHAR_CLASS)
+        printf("^");
       for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
       {
         c = pattern[i].u.ccl[j];
diff --git a/tests/test1.c b/tests/test1.c
index af43c99..7005494 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -75,15 +75,13 @@ char* test_vector[][4] =
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world!   ",  (char*) 11     },
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "Hello world  !",   (char*) 13     },
   { OK,  "[Hh]ello [Ww]orld\\s*[!]?", "hello World    !", (char*) 15     },
-  { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */
-/*
+  { NOK, "\\d\\d?:\\d\\d?:\\d\\d?",   "a:0",              (char*) 0      },
   { OK,  "[^\\w][^-1-4]",     ")T",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     ")^",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "*)",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "!.",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     " x",          (char*) 2      },
   { OK,  "[^\\w][^-1-4]",     "$b",          (char*) 2      },
-*/
   { OK,  ".?bar",                      "real_bar",        (char*) 4      },
   { NOK, ".?bar",                      "real_foo",        (char*) 0      },
   { NOK, "X?Y",                        "Z",               (char*) 0      },

From f334c5b3ff61acb6fbeebf83c11502bb9989b64b Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Fri, 10 Jun 2022 15:38:31 +0200
Subject: [PATCH 08/10] prepare multi-byte support

and fix isalpha crashes on bad libc's. Fixes GH #70.
e.g. UTF-8.
---
 re.c          | 7 ++++---
 tests/test1.c | 5 +++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/re.c b/re.c
index d4413e6..fae8aa0 100644
--- a/re.c
+++ b/re.c
@@ -296,15 +296,15 @@ void re_print(regex_t* pattern)
 /* Private functions: */
 static int matchdigit(char c)
 {
-  return isdigit(c);
+  return isdigit((unsigned char)c);
 }
 static int matchalpha(char c)
 {
-  return isalpha(c);
+  return isalpha((unsigned char)c);
 }
 static int matchwhitespace(char c)
 {
-  return isspace(c);
+  return isspace((unsigned char)c);
 }
 static int matchalphanum(char c)
 {
@@ -407,6 +407,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
 {
   int prelen = *matchlength;
   const char* prepoint = text;
+  // TODO check if multibyte, and use mbtowc() then
   while ((text[0] != '\0') && matchone(p, *text))
   {
     text++;
diff --git a/tests/test1.c b/tests/test1.c
index 7005494..b98be12 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -4,6 +4,7 @@
 
 #include <stdio.h>
 #include <string.h>
+//#include <locale.h>
 #include "re.h"
 
 
@@ -90,6 +91,8 @@ char* test_vector[][4] =
   { NOK, "a\\",                       "a\\",              (char*) 0      },
   { NOK, "\\",                        "\\",               (char*) 0      },
   { OK,  "\\\\",                      "\\",               (char*) 1      },
+  // no multibyte support yet
+  //{ OK,  "\\w+",                      "Çüéâ",             (char*) 4      },
 };
 
 
@@ -106,6 +109,8 @@ int main()
     size_t nfailed = 0;
     size_t i;
 
+    //setlocale(LC_CTYPE, "en_US.UTF-8");
+
     for (i = 0; i < ntests; ++i)
     {
         pattern = test_vector[i][1];

From 148e229fb68a7875668653df6572a80f8ca8b988 Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Mon, 20 Jun 2022 08:44:45 +0200
Subject: [PATCH 09/10] TODOs and new tests

---
 README.md     | 13 +++++++++----
 tests/test1.c |  2 ++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index fabddab..d74f46a 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ The following features / regex-operators are supported by this library.
   -  `[abc]`     Character class, match if one of {'a', 'b', 'c'}
   -  `[^abc]`   Inverted class, match if NOT one of {'a', 'b', 'c'}
   -  `[a-zA-Z]` Character ranges, the character set of the ranges { a-z | A-Z }
-  -  `\s`       Whitespace, \t \f \r \n \v and spaces
+  -  `\s`       Whitespace, '\t' '\f' '\r' '\n' '\v' and spaces
   -  `\S`       Non-whitespace
   -  `\w`       Alphanumeric, [a-zA-Z0-9_]
   -  `\W`       Non-alphanumeric
@@ -88,7 +88,7 @@ int match_length;
 /* Standard null-terminated C-string to search: */
 const char* string_to_search = "ahem.. 'hello world !' ..";
 
-/* Compile a simple regular expression using character classes, meta-char and greedy + non-greedy quantifiers: */
+/* Compile a simple regular expression using character classes, meta-char and greedy quantifiers: */
 re_t pattern = re_compile("[Hh]ello [Ww]orld\\s*[!]?");
 
 /* Check if the regex matches the text: */
@@ -102,10 +102,15 @@ if (match_idx != -1)
 For more usage examples I encourage you to look at the code in the `tests`-folder.
 
 ### TODO
-- Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`.
+- Fix implementation of branches (`|`) (see the branch), and add groups as well, e.g. `(a|b)+`.
+- `re_match_capture()` with groups.
 - Add `example.c` that demonstrates usage.
 - Add `tests/test_perf.c` for performance and time measurements.
-- Add optional multibyte support (e.g. UTF-8)
+- Add optional multibyte support (e.g. UTF-8). On non-wchar systems roll our own.
+- Word boundary: \b \B
+- non-greedy, lazy quantifiers (??, +?, *?, {n,m}?)
+- case-insensitive option or API. `re_matchi()`
+- '.' may not match '\r' nor '\n', unless a single-line option is given.
 - Testing: Improve pattern rejection testing.
 
 ### FAQ
diff --git a/tests/test1.c b/tests/test1.c
index b98be12..228b2e1 100644
--- a/tests/test1.c
+++ b/tests/test1.c
@@ -37,6 +37,8 @@ char* test_vector[][4] =
   { OK,  "[abc]",                     "1c2",              (char*) 1      },
   { NOK, "[abc]",                     "1C2",              (char*) 0      },
   { OK,  "[1-5]+",                    "0123456789",       (char*) 5      },
+  { OK,  "[1-5-]+",                   "123-",             (char*) 4      },
+  { OK,  "[1-5-]+[-1-2]-[-]", 	      "13132231--353444-511--",    (char *) 22  },
   { OK,  "[.2]",                      "1C2",              (char*) 1      },
   { OK,  "a*$",                       "Xaa",              (char*) 2      },
   { OK,  "a*$",                       "Xaa",              (char*) 2      },

From 89f513f4e8fb74a673bf9dface055faee2d4ba2a Mon Sep 17 00:00:00 2001
From: Reinhard Urban <reinhard.urban@nubix.de>
Date: Mon, 20 Jun 2022 08:56:38 +0200
Subject: [PATCH 10/10] fix ranges with ending -

Fixes GH #79 and the exreg failures with [1-5-]+[-1-2]-[-]
---
 re.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/re.c b/re.c
index fae8aa0..e81aa67 100644
--- a/re.c
+++ b/re.c
@@ -373,7 +373,9 @@ static int matchcharclass(char c, const char* str)
     {
       if (c == '-')
       {
-        return ((str[-1] == '\0') || (str[1] == '\0'));
+        if ((str[-1] == '\0') || (str[1] == '\0'))
+            return 1;
+        // else continue
       }
       else
       {