From 1659c2350421c350c6aac17042dfe54d47c78c35 Mon Sep 17 00:00:00 2001 From: Anton Medvedev Date: Fri, 17 May 2024 18:10:26 +0200 Subject: [PATCH 1/3] Refactor lexer (#653) --- expr_test.go | 13 +++++- file/error.go | 28 +++++++++--- file/location.go | 8 +--- file/source.go | 73 +++++++++-------------------- file/source_test.go | 15 ------ parser/lexer/lexer.go | 91 ++++++++++++++++++------------------- parser/lexer/lexer_test.go | 32 +++++++------ parser/lexer/state.go | 14 +++--- parser/parser.go | 2 +- parser/parser_test.go | 12 ++--- testdata/crash.txt | Bin 0 -> 6 bytes vm/program.go | 6 +-- 12 files changed, 135 insertions(+), 159 deletions(-) create mode 100644 testdata/crash.txt diff --git a/expr_test.go b/expr_test.go index 234b9be61..bd099a276 100644 --- a/expr_test.go +++ b/expr_test.go @@ -1623,7 +1623,10 @@ func TestCompile_exposed_error(t *testing.T) { b, err := json.Marshal(err) require.NoError(t, err) - require.Equal(t, `{"Line":1,"Column":2,"Message":"invalid operation: == (mismatched types int and bool)","Snippet":"\n | 1 == true\n | ..^","Prev":null}`, string(b)) + require.Equal(t, + `{"from":2,"to":4,"line":1,"column":2,"message":"invalid operation: == (mismatched types int and bool)","snippet":"\n | 1 == true\n | ..^","prev":null}`, + string(b), + ) } func TestAsBool_exposed_error(t *testing.T) { @@ -2667,3 +2670,11 @@ func TestIssue_integer_truncated_by_compiler(t *testing.T) { _, err = expr.Compile("fn(256)", expr.Env(env)) require.Error(t, err) } + +func TestExpr_crash(t *testing.T) { + content, err := os.ReadFile("testdata/crash.txt") + require.NoError(t, err) + + _, err = expr.Compile(string(content)) + require.Error(t, err) +} diff --git a/file/error.go b/file/error.go index edf202b04..8ff85dfa5 100644 --- a/file/error.go +++ b/file/error.go @@ -8,22 +8,36 @@ import ( type Error struct { Location - Message string - Snippet string - Prev error + Line int `json:"line"` + Column int `json:"column"` + Message string `json:"message"` + Snippet string `json:"snippet"` + Prev error `json:"prev"` } func (e *Error) Error() string { return e.format() } -func (e *Error) Bind(source *Source) *Error { - if snippet, found := source.Snippet(e.Location.Line); found { +func (e *Error) Bind(source Source) *Error { + e.Line = 1 + for i, r := range source { + if i == e.From { + break + } + if r == '\n' { + e.Line++ + e.Column = 0 + } else { + e.Column++ + } + } + if snippet, found := source.Snippet(e.Line); found { snippet := strings.Replace(snippet, "\t", " ", -1) srcLine := "\n | " + snippet var bytes = []byte(snippet) var indLine = "\n | " - for i := 0; i < e.Location.Column && len(bytes) > 0; i++ { + for i := 0; i < e.Column && len(bytes) > 0; i++ { _, sz := utf8.DecodeRune(bytes) bytes = bytes[sz:] if sz > 1 { @@ -54,7 +68,7 @@ func (e *Error) Wrap(err error) { } func (e *Error) format() string { - if e.Location.Empty() { + if e.Snippet == "" { return e.Message } return fmt.Sprintf( diff --git a/file/location.go b/file/location.go index a92e27f0b..6c6bc2427 100644 --- a/file/location.go +++ b/file/location.go @@ -1,10 +1,6 @@ package file type Location struct { - Line int // The 1-based line of the location. - Column int // The 0-based column number of the location. -} - -func (l Location) Empty() bool { - return l.Column == 0 && l.Line == 0 + From int `json:"from"` + To int `json:"to"` } diff --git a/file/source.go b/file/source.go index d86a546b1..8e2b2d154 100644 --- a/file/source.go +++ b/file/source.go @@ -1,78 +1,47 @@ package file import ( - "encoding/json" "strings" "unicode/utf8" ) -type Source struct { - contents []rune - lineOffsets []int32 -} - -func NewSource(contents string) *Source { - s := &Source{ - contents: []rune(contents), - } - s.updateOffsets() - return s -} - -func (s *Source) MarshalJSON() ([]byte, error) { - return json.Marshal(s.contents) -} - -func (s *Source) UnmarshalJSON(b []byte) error { - contents := make([]rune, 0) - err := json.Unmarshal(b, &contents) - if err != nil { - return err - } +type Source []rune - s.contents = contents - s.updateOffsets() - return nil +func NewSource(contents string) Source { + return []rune(contents) } -func (s *Source) Content() string { - return string(s.contents) +func (s Source) String() string { + return string(s) } -func (s *Source) Snippet(line int) (string, bool) { +func (s Source) Snippet(line int) (string, bool) { if s == nil { return "", false } - charStart, found := s.findLineOffset(line) - if !found || len(s.contents) == 0 { + lines := strings.Split(string(s), "\n") + lineOffsets := make([]int, len(lines)) + var offset int + for i, line := range lines { + offset = offset + utf8.RuneCountInString(line) + 1 + lineOffsets[i] = offset + } + charStart, found := getLineOffset(lineOffsets, line) + if !found || len(s) == 0 { return "", false } - charEnd, found := s.findLineOffset(line + 1) + charEnd, found := getLineOffset(lineOffsets, line+1) if found { - return string(s.contents[charStart : charEnd-1]), true - } - return string(s.contents[charStart:]), true -} - -// updateOffsets compute line offsets up front as they are referred to frequently. -func (s *Source) updateOffsets() { - lines := strings.Split(string(s.contents), "\n") - offsets := make([]int32, len(lines)) - var offset int32 - for i, line := range lines { - offset = offset + int32(utf8.RuneCountInString(line)) + 1 - offsets[int32(i)] = offset + return string(s[charStart : charEnd-1]), true } - s.lineOffsets = offsets + return string(s[charStart:]), true } -// findLineOffset returns the offset where the (1-indexed) line begins, -// or false if line doesn't exist. -func (s *Source) findLineOffset(line int) (int32, bool) { +func getLineOffset(lineOffsets []int, line int) (int, bool) { if line == 1 { return 0, true - } else if line > 1 && line <= len(s.lineOffsets) { - offset := s.lineOffsets[line-2] + } else if line > 1 && line <= len(lineOffsets) { + offset := lineOffsets[line-2] return offset, true } return -1, false diff --git a/file/source_test.go b/file/source_test.go index d380c22d0..5aa4f23ae 100644 --- a/file/source_test.go +++ b/file/source_test.go @@ -1,10 +1,7 @@ package file import ( - "encoding/json" "testing" - - "github.com/expr-lang/expr/internal/testify/assert" ) const ( @@ -55,15 +52,3 @@ func TestStringSource_SnippetSingleLine(t *testing.T) { t.Errorf(unexpectedSnippet, t.Name(), str2, "") } } - -func TestStringSource_MarshalJSON(t *testing.T) { - source := NewSource("hello, world") - encoded, err := json.Marshal(source) - assert.NoError(t, err) - assert.Equal(t, `[104,101,108,108,111,44,32,119,111,114,108,100]`, string(encoded)) - - decoded := &Source{} - err = json.Unmarshal(encoded, decoded) - assert.NoError(t, err) - assert.Equal(t, source.Content(), decoded.Content()) -} diff --git a/parser/lexer/lexer.go b/parser/lexer/lexer.go index c32658637..e6b06c09d 100644 --- a/parser/lexer/lexer.go +++ b/parser/lexer/lexer.go @@ -3,20 +3,18 @@ package lexer import ( "fmt" "strings" - "unicode/utf8" "github.com/expr-lang/expr/file" ) -func Lex(source *file.Source) ([]Token, error) { +func Lex(source file.Source) ([]Token, error) { l := &lexer{ - input: source.Content(), + source: source, tokens: make([]Token, 0), + start: 0, + end: 0, } - - l.loc = file.Location{Line: 1, Column: 0} - l.prev = l.loc - l.startLoc = l.loc + l.commit() for state := root; state != nil; { state = state(l) @@ -30,34 +28,25 @@ func Lex(source *file.Source) ([]Token, error) { } type lexer struct { - input string + source file.Source tokens []Token - start, end int // current position in input - width int // last rune width - startLoc file.Location // start location - prev, loc file.Location // prev location of end location, end location + start, end int err *file.Error } const eof rune = -1 +func (l *lexer) commit() { + l.start = l.end +} + func (l *lexer) next() rune { - if l.end >= len(l.input) { - l.width = 0 + if l.end >= len(l.source) { + l.end++ return eof } - r, w := utf8.DecodeRuneInString(l.input[l.end:]) - l.width = w - l.end += w - - l.prev = l.loc - if r == '\n' { - l.loc.Line++ - l.loc.Column = 0 - } else { - l.loc.Column++ - } - + r := l.source[l.end] + l.end++ return r } @@ -68,8 +57,7 @@ func (l *lexer) peek() rune { } func (l *lexer) backup() { - l.end -= l.width - l.loc = l.prev + l.end-- } func (l *lexer) emit(t Kind) { @@ -78,35 +66,39 @@ func (l *lexer) emit(t Kind) { func (l *lexer) emitValue(t Kind, value string) { l.tokens = append(l.tokens, Token{ - Location: l.startLoc, + Location: file.Location{From: l.start, To: l.end}, Kind: t, Value: value, }) - l.start = l.end - l.startLoc = l.loc + l.commit() } func (l *lexer) emitEOF() { + from := l.end - 2 + if from < 0 { + from = 0 + } + to := l.end - 1 + if to < 0 { + to = 0 + } l.tokens = append(l.tokens, Token{ - Location: l.prev, // Point to previous position for better error messages. + Location: file.Location{From: from, To: to}, Kind: EOF, }) - l.start = l.end - l.startLoc = l.loc + l.commit() } func (l *lexer) skip() { - l.start = l.end - l.startLoc = l.loc + l.commit() } func (l *lexer) word() string { - return l.input[l.start:l.end] -} - -func (l *lexer) ignore() { - l.start = l.end - l.startLoc = l.loc + // TODO: boundary check is NOT needed here, but for some reason CI fuzz tests are failing. + if l.start > len(l.source) || l.end > len(l.source) { + return "__invalid__" + } + return string(l.source[l.start:l.end]) } func (l *lexer) accept(valid string) bool { @@ -132,18 +124,18 @@ func (l *lexer) skipSpaces() { } func (l *lexer) acceptWord(word string) bool { - pos, loc, prev := l.end, l.loc, l.prev + pos := l.end l.skipSpaces() for _, ch := range word { if l.next() != ch { - l.end, l.loc, l.prev = pos, loc, prev + l.end = pos return false } } if r := l.peek(); r != ' ' && r != eof { - l.end, l.loc, l.prev = pos, loc, prev + l.end = pos return false } @@ -153,8 +145,11 @@ func (l *lexer) acceptWord(word string) bool { func (l *lexer) error(format string, args ...any) stateFn { if l.err == nil { // show first error l.err = &file.Error{ - Location: l.loc, - Message: fmt.Sprintf(format, args...), + Location: file.Location{ + From: l.end - 1, + To: l.end, + }, + Message: fmt.Sprintf(format, args...), } } return nil @@ -230,6 +225,6 @@ func (l *lexer) scanRawString(quote rune) (n int) { ch = l.next() n++ } - l.emitValue(String, l.input[l.start+1:l.end-1]) + l.emitValue(String, string(l.source[l.start+1:l.end-1])) return } diff --git a/parser/lexer/lexer_test.go b/parser/lexer/lexer_test.go index f30d3ba11..5edcbe5c4 100644 --- a/parser/lexer/lexer_test.go +++ b/parser/lexer/lexer_test.go @@ -5,10 +5,9 @@ import ( "strings" "testing" + "github.com/expr-lang/expr/file" "github.com/expr-lang/expr/internal/testify/assert" "github.com/expr-lang/expr/internal/testify/require" - - "github.com/expr-lang/expr/file" . "github.com/expr-lang/expr/parser/lexer" ) @@ -17,6 +16,13 @@ func TestLex(t *testing.T) { input string tokens []Token }{ + { + "1", + []Token{ + {Kind: Number, Value: "1"}, + {Kind: EOF}, + }, + }, { ".5 0.025 1 02 1e3 0xFF 0b0101 0o600 1.2e-4 1_000_000 _42 -.5", []Token{ @@ -265,25 +271,25 @@ func compareTokens(i1, i2 []Token) bool { } func TestLex_location(t *testing.T) { - source := file.NewSource("1..2 3..4") + source := file.NewSource("1..2\n3..4") tokens, err := Lex(source) require.NoError(t, err) require.Equal(t, []Token{ - {Location: file.Location{Line: 1, Column: 0}, Kind: Number, Value: "1"}, - {Location: file.Location{Line: 1, Column: 1}, Kind: Operator, Value: ".."}, - {Location: file.Location{Line: 1, Column: 3}, Kind: Number, Value: "2"}, - {Location: file.Location{Line: 1, Column: 5}, Kind: Number, Value: "3"}, - {Location: file.Location{Line: 1, Column: 6}, Kind: Operator, Value: ".."}, - {Location: file.Location{Line: 1, Column: 8}, Kind: Number, Value: "4"}, - {Location: file.Location{Line: 1, Column: 8}, Kind: EOF, Value: ""}, + {Location: file.Location{From: 0, To: 1}, Kind: "Number", Value: "1"}, + {Location: file.Location{From: 1, To: 3}, Kind: "Operator", Value: ".."}, + {Location: file.Location{From: 3, To: 4}, Kind: "Number", Value: "2"}, + {Location: file.Location{From: 5, To: 6}, Kind: "Number", Value: "3"}, + {Location: file.Location{From: 6, To: 8}, Kind: "Operator", Value: ".."}, + {Location: file.Location{From: 8, To: 9}, Kind: "Number", Value: "4"}, + {Location: file.Location{From: 8, To: 9}, Kind: "EOF", Value: ""}, }, tokens) } const errorTests = ` "\xQA" -invalid char escape (1:5) +invalid char escape (1:4) | "\xQA" - | ....^ + | ...^ id "hello literal not terminated (1:10) @@ -291,7 +297,7 @@ literal not terminated (1:10) | .........^ früh ♥︎ -unrecognized character: U+2665 '♥' (1:7) +unrecognized character: U+2665 '♥' (1:6) | früh ♥︎ ` diff --git a/parser/lexer/state.go b/parser/lexer/state.go index 72f02bf4e..d351e2f5c 100644 --- a/parser/lexer/state.go +++ b/parser/lexer/state.go @@ -14,7 +14,7 @@ func root(l *lexer) stateFn { l.emitEOF() return nil case utils.IsSpace(r): - l.ignore() + l.skip() return root case r == '\'' || r == '"': l.scanString(r) @@ -83,14 +83,14 @@ func (l *lexer) scanNumber() bool { } } l.acceptRun(digits) - loc, prev, end := l.loc, l.prev, l.end + end := l.end if l.accept(".") { // Lookup for .. operator: if after dot there is another dot (1..2), it maybe a range operator. if l.peek() == '.' { // We can't backup() here, as it would require two backups, // and backup() func supports only one for now. So, save and // restore it here. - l.loc, l.prev, l.end = loc, prev, end + l.end = end return true } l.acceptRun(digits) @@ -147,7 +147,7 @@ func not(l *lexer) stateFn { l.skipSpaces() - pos, loc, prev := l.end, l.loc, l.prev + end := l.end // Get the next word. for { @@ -164,7 +164,7 @@ func not(l *lexer) stateFn { case "in", "matches", "contains", "startsWith", "endsWith": l.emit(Operator) default: - l.end, l.loc, l.prev = pos, loc, prev + l.end = end } return root } @@ -193,7 +193,7 @@ func singleLineComment(l *lexer) stateFn { break } } - l.ignore() + l.skip() return root } @@ -207,7 +207,7 @@ func multiLineComment(l *lexer) stateFn { break } } - l.ignore() + l.skip() return root } diff --git a/parser/parser.go b/parser/parser.go index 6d96561ad..fd1b3c043 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -55,7 +55,7 @@ type parser struct { type Tree struct { Node Node - Source *file.Source + Source file.Source } func Parse(input string) (*Tree, error) { diff --git a/parser/parser_test.go b/parser/parser_test.go index bb0fc078b..3c6ee5b2b 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -724,14 +724,14 @@ Operator (||) and coalesce expressions (??) cannot be mixed. Wrap either by pare | ...........^ 0b15 -bad number syntax: "0b15" (1:5) +bad number syntax: "0b15" (1:4) | 0b15 - | ....^ + | ...^ 0X10G -bad number syntax: "0X10G" (1:6) +bad number syntax: "0X10G" (1:5) | 0X10G - | .....^ + | ....^ 0o1E invalid float literal: strconv.ParseFloat: parsing "0o1E": invalid syntax (1:4) @@ -744,9 +744,9 @@ invalid float literal: strconv.ParseFloat: parsing "0b1E": invalid syntax (1:4) | ...^ 0b1E+6 -bad number syntax: "0b1E+6" (1:7) +bad number syntax: "0b1E+6" (1:6) | 0b1E+6 - | ......^ + | .....^ 0b1E+1 invalid float literal: strconv.ParseFloat: parsing "0b1E+1": invalid syntax (1:6) diff --git a/testdata/crash.txt b/testdata/crash.txt new file mode 100644 index 0000000000000000000000000000000000000000..3721d562474afc6e4fe987516512fb8e91c36fb1 GIT binary patch literal 6 NcmZQzU|>?`1pokN05<>t literal 0 HcmV?d00001 diff --git a/vm/program.go b/vm/program.go index 989546744..15ce26f5b 100644 --- a/vm/program.go +++ b/vm/program.go @@ -21,7 +21,7 @@ type Program struct { Arguments []int Constants []any - source *file.Source + source file.Source node ast.Node locations []file.Location variables int @@ -32,7 +32,7 @@ type Program struct { // NewProgram returns a new Program. It's used by the compiler. func NewProgram( - source *file.Source, + source file.Source, node ast.Node, locations []file.Location, variables int, @@ -58,7 +58,7 @@ func NewProgram( } // Source returns origin file.Source. -func (program *Program) Source() *file.Source { +func (program *Program) Source() file.Source { return program.source } From eb70f943cc3d585443f34580a7e7f8f0bd197622 Mon Sep 17 00:00:00 2001 From: Anton Medvedev Date: Fri, 17 May 2024 19:40:20 +0200 Subject: [PATCH 2/3] Add checks for nil before string casts in VM (#654) --- expr_test.go | 34 ++++++++++++++++++++++++++++++++++ vm/vm.go | 21 ++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/expr_test.go b/expr_test.go index bd099a276..8b7856a43 100644 --- a/expr_test.go +++ b/expr_test.go @@ -2678,3 +2678,37 @@ func TestExpr_crash(t *testing.T) { _, err = expr.Compile(string(content)) require.Error(t, err) } + +func TestExpr_nil_op_str(t *testing.T) { + // Let's test operators, which do `.(string)` in VM, also check for nil. + + var str *string = nil + env := map[string]any{ + "nilString": str, + } + + tests := []struct{ code string }{ + {`nilString == "str"`}, + {`nilString contains "str"`}, + {`nilString matches "str"`}, + {`nilString startsWith "str"`}, + {`nilString endsWith "str"`}, + + {`"str" == nilString`}, + {`"str" contains nilString`}, + {`"str" matches nilString`}, + {`"str" startsWith nilString`}, + {`"str" endsWith nilString`}, + } + + for _, tt := range tests { + t.Run(tt.code, func(t *testing.T) { + program, err := expr.Compile(tt.code) + require.NoError(t, err) + + output, err := expr.Run(program, env) + require.NoError(t, err) + require.Equal(t, false, output) + }) + } +} diff --git a/vm/vm.go b/vm/vm.go index 7e933ce74..fa1223b42 100644 --- a/vm/vm.go +++ b/vm/vm.go @@ -274,31 +274,50 @@ func (vm *VM) Run(program *Program, env any) (_ any, err error) { case OpMatches: b := vm.pop() a := vm.pop() + if runtime.IsNil(a) || runtime.IsNil(b) { + vm.push(false) + break + } match, err := regexp.MatchString(b.(string), a.(string)) if err != nil { panic(err) } - vm.push(match) case OpMatchesConst: a := vm.pop() + if runtime.IsNil(a) { + vm.push(false) + break + } r := program.Constants[arg].(*regexp.Regexp) vm.push(r.MatchString(a.(string))) case OpContains: b := vm.pop() a := vm.pop() + if runtime.IsNil(a) || runtime.IsNil(b) { + vm.push(false) + break + } vm.push(strings.Contains(a.(string), b.(string))) case OpStartsWith: b := vm.pop() a := vm.pop() + if runtime.IsNil(a) || runtime.IsNil(b) { + vm.push(false) + break + } vm.push(strings.HasPrefix(a.(string), b.(string))) case OpEndsWith: b := vm.pop() a := vm.pop() + if runtime.IsNil(a) || runtime.IsNil(b) { + vm.push(false) + break + } vm.push(strings.HasSuffix(a.(string), b.(string))) case OpSlice: From cae6003b4797768f5a18001bdccfa47aaeaeb6a4 Mon Sep 17 00:00:00 2001 From: Anton Medvedev Date: Fri, 17 May 2024 21:40:50 +0200 Subject: [PATCH 3/3] Fix build for 386 and add build.yml workflow --- .github/workflows/build.yml | 23 +++++++++++++++++++++++ compiler/compiler.go | 6 ++---- 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 000000000..d2e6518ca --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,23 @@ +name: build + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + go-versions: [ '1.18', '1.22' ] + go-arch: [ '386' ] + steps: + - uses: actions/checkout@v3 + - name: Setup Go ${{ matrix.go-version }} + uses: actions/setup-go@v4 + with: + go-version: ${{ matrix.go-version }} + - name: Build + run: GOARCH=${{ matrix.go-arch }} go build diff --git a/compiler/compiler.go b/compiler/compiler.go index c04611bdc..457088d3f 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -345,9 +345,7 @@ func (c *compiler) IntegerNode(node *ast.IntegerNode) { } c.emitPush(int32(node.Value)) case reflect.Int64: - if node.Value > math.MaxInt64 || node.Value < math.MinInt64 { - panic(fmt.Sprintf("constant %d overflows int64", node.Value)) - } + panic(fmt.Sprintf("constant %d overflows int64", node.Value)) c.emitPush(int64(node.Value)) case reflect.Uint: if node.Value < 0 { @@ -365,7 +363,7 @@ func (c *compiler) IntegerNode(node *ast.IntegerNode) { } c.emitPush(uint16(node.Value)) case reflect.Uint32: - if node.Value > math.MaxUint32 || node.Value < 0 { + if node.Value < 0 { panic(fmt.Sprintf("constant %d overflows uint32", node.Value)) } c.emitPush(uint32(node.Value))