Skip to content

Commit

Permalink
Add ASCII-only option, to mimic default RE2 behavior (#1)
Browse files Browse the repository at this point in the history
* add ASCII-only option, to mimic default RE2 behaviour

This is a workaround, motivated by the difference between how Oniguruma and
Go's default RE2 engine handle invalid UTF-8 bytes.

See src-d/enry#225 (comment)

Summary of changes:
 - c: prevent `NewOnigRegex()` from hard-coding UTF8
 - c: `NewOnigRegex()` now properly calls `onig_initialize()` [1]
 - go: expose new `MustCompileASCII()`, whose default `\w` character class matches only ASCII
 - go: `MustCompile()` refactored, `initRegexp()` extracted for common UTF8/ASCII logic

The encoding was intentionally not exposed at the Go API level, for simplicity,
in order to avoid introducing a complex struct type [2] to the API surface.

 1. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/doc/API#L6
 2. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/src/oniguruma.h#L121

Signed-off-by: Alexander Bezzubov <[email protected]>

* ci: test on 2 latest go versions

Signed-off-by: Alexander Bezzubov <[email protected]>

* ci: bump version of Oniguruma to 6.9.1

Update deb to get fix https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627

Signed-off-by: Alexander Bezzubov <[email protected]>

* ci: refactor oniguruma installation

Signed-off-by: Alexander Bezzubov <[email protected]>

* refactoring go part a bit, addressing review feedback

Signed-off-by: Alexander Bezzubov <[email protected]>

* ci: fix typo in bash var substitution

Signed-off-by: Alexander Bezzubov <[email protected]>

* cgo: simplify naive encoding init

Signed-off-by: Alexander Bezzubov <[email protected]>

* go: doc syntax fix

Signed-off-by: Alexander Bezzubov <[email protected]>

* tixing fypos

Signed-off-by: Alexander Bezzubov <[email protected]>
  • Loading branch information
bzz authored and kuba-- committed May 8, 2019
1 parent 7883039 commit 304256c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 15 deletions.
23 changes: 14 additions & 9 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
dist: trusty
language: go
go:
- '1.11.x'
- '1.12.x'

env:
global:
- LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH}
- GO111MODULE=on
addons:
apt:
packages:
- libonig-dev
- ONIGURUMA_VERSION='6.9.1'

jobs:
include:
- go: 1.11.x
script:
- go test -v --cover -race
before_install: # install oniguruma manually as trusty has only ancient 5.x
- sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627
- wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
- sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
- wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
- sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
script:
- go test -v --cover -race
2 changes: 1 addition & 1 deletion chelper.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int NewOnigRegex( char *pattern, int pattern_length, int option,
*error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo));
memset(*error_info, 0, sizeof(OnigErrorInfo));

*encoding = (void*)ONIG_ENCODING_UTF8;
onig_initialize_encoding(*encoding);

*error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char));

Expand Down
28 changes: 23 additions & 5 deletions regex.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,24 @@ type Regexp struct {
namedGroupInfo NamedGroupInfo
}

// NewRegexp creates and initializes a new Regexp with the given pattern and
// option, using the UTF-8 encoding.
//
// Compilation is delegated to initRegexp, which converts the pattern to a C
// string itself, so no C string is allocated here.
func NewRegexp(pattern string, option int) (re *Regexp, err error) {
	return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option)
}

// NewRegexpASCII behaves like NewRegexp, except that the Regexp is set up
// with the ASCII encoding instead of UTF-8.
func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) {
	ascii := &Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}
	return initRegexp(ascii, option)
}

func initRegexp(re *Regexp, option int) (*Regexp, error) {
var err error
patternCharPtr := C.CString(re.pattern)
defer C.free(unsafe.Pointer(patternCharPtr))
mutex.Lock()
defer mutex.Unlock()
error_code := C.NewOnigRegex(patternCharPtr, C.int(len(pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf)
if error_code != C.ONIG_NORMAL {
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf)
if errorCode != C.ONIG_NORMAL {
err = errors.New(C.GoString(re.errorBuf))
} else {
err = nil
Expand Down Expand Up @@ -95,6 +104,15 @@ func MustCompileWithOption(str string, option int) *Regexp {
return regexp
}

// MustCompileASCII compiles str with the default options and the encoding
// restricted to ASCII. It panics if NewRegexpASCII returns an error.
func MustCompileASCII(str string) *Regexp {
	re, err := NewRegexpASCII(str, ONIG_OPTION_DEFAULT)
	if err == nil {
		return re
	}
	panic("regexp: compiling " + str + ": " + err.Error())
}

func (re *Regexp) Free() {
mutex.Lock()
if re.regex != nil {
Expand Down

0 comments on commit 304256c

Please sign in to comment.