From 304256cb638a6afd1cc7d6810be34447abc398bf Mon Sep 17 00:00:00 2001 From: Alexander Date: Wed, 8 May 2019 15:02:52 +0200 Subject: [PATCH] Add ASCII-only option, to mimic default RE2 behavior (#1) * add ASCII-only option, to mimic default RE2 behaviour This is a workaround, motivated by the difference in handling non-valid UTF8 bytes that Oniguruma has, compared to Go's default RE2. See https://github.com/src-d/enry/issues/225#issuecomment-490043281 Summary of changes: - c: prevent `NewOnigRegex()` from hard-coding UTF8 - c: `NewOnigRegex()` now properly calls `onig_initialize()` [1] - go: expose new `MustCompileASCII()`, whose default \w character class matches only ASCII - go: `MustCompile()` refactored, `initRegexp()` extracted for common UTF8/ASCII logic Encoding was intentionally not exposed at the Go API level, for simplicity, in order to avoid introducing a complex struct type [2] to the API surface. 1. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/doc/API#L6 2. 
https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/src/oniguruma.h#L121 Signed-off-by: Alexander Bezzubov * ci: test on 2 latest go versions Signed-off-by: Alexander Bezzubov * ci: bump version of Oniguruma to 6.9.1 Update deb to get fix https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 Signed-off-by: Alexander Bezzubov * ci: refactor oniguruma installation Signed-off-by: Alexander Bezzubov * refactoring go part a bit, addressing review feedback Signed-off-by: Alexander Bezzubov * ci: fix typo in bash var substitution Signed-off-by: Alexander Bezzubov * cgo: simplify naive encoding init Signed-off-by: Alexander Bezzubov * go: doc syntax fix Signed-off-by: Alexander Bezzubov * tixing fypos Signed-off-by: Alexander Bezzubov --- .travis.yml | 23 ++++++++++++++--------- chelper.c | 2 +- regex.go | 28 +++++++++++++++++++++++----- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8c53b50..29bddf1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,20 @@ +dist: trusty language: go +go: + - '1.11.x' + - '1.12.x' + env: global: - LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH} - GO111MODULE=on -addons: - apt: - packages: - - libonig-dev + - ONIGURUMA_VERSION='6.9.1' -jobs: - include: - - go: 1.11.x - script: - - go test -v --cover -race \ No newline at end of file +before_install: # install oniguruma manually as trusty has only ancient 5.x + - sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 + - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" + - sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" + - wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" + - sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" +script: + - go test -v --cover -race diff --git a/chelper.c b/chelper.c 
index 7b605d1..d768a77 100644 --- a/chelper.c +++ b/chelper.c @@ -17,7 +17,7 @@ int NewOnigRegex( char *pattern, int pattern_length, int option, *error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo)); memset(*error_info, 0, sizeof(OnigErrorInfo)); - *encoding = (void*)ONIG_ENCODING_UTF8; + onig_initialize_encoding(*encoding); *error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); diff --git a/regex.go b/regex.go index 9bfc0a0..cbb647c 100644 --- a/regex.go +++ b/regex.go @@ -47,15 +47,24 @@ type Regexp struct { namedGroupInfo NamedGroupInfo } +// NewRegexp creates and initializes a new Regexp with the given pattern and option. func NewRegexp(pattern string, option int) (re *Regexp, err error) { - re = &Regexp{pattern: pattern} - patternCharPtr := C.CString(pattern) - defer C.free(unsafe.Pointer(patternCharPtr)) + return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) +} + +// NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII. 
+func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) { + return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option) +} +func initRegexp(re *Regexp, option int) (*Regexp, error) { + var err error + patternCharPtr := C.CString(re.pattern) + defer C.free(unsafe.Pointer(patternCharPtr)) mutex.Lock() defer mutex.Unlock() - error_code := C.NewOnigRegex(patternCharPtr, C.int(len(pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) - if error_code != C.ONIG_NORMAL { + errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) + if errorCode != C.ONIG_NORMAL { err = errors.New(C.GoString(re.errorBuf)) } else { err = nil @@ -95,6 +104,15 @@ func MustCompileWithOption(str string, option int) *Regexp { return regexp } +// MustCompileASCII is equivalent to MustCompile, but with the encoding restricted to ASCII. +func MustCompileASCII(str string) *Regexp { + regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT) + if error != nil { + panic("regexp: compiling " + str + ": " + error.Error()) + } + return regexp +} + func (re *Regexp) Free() { mutex.Lock() if re.regex != nil {