Commit e3d09bb

Merge pull request #27 from RReverser/sweet-js-rework

Add context-aware state machine

tdewolff authored Jun 10, 2017 · 2 parents 874ae8f + 68e2aa9
Showing 3 changed files with 141 additions and 167 deletions.
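For orientation before the diff: `/` is ambiguous in JavaScript (division operator vs. start of a regular expression), and `}` becomes ambiguous once template literals are involved (closing a block or object vs. resuming a template after `${…}`). The old lexer tracked two booleans, `regexpState` and `templateState`; this commit replaces them with a `TokenState` plus a stack of `ParsingContext` values so both ambiguities are resolved correctly under nesting. A minimal sketch of the `/` case, driving the lexer through the API shown in this diff (the `github.com/tdewolff/parse/js` import path is an assumption, not part of the diff):

package main

import (
	"bytes"
	"fmt"

	"github.com/tdewolff/parse/js" // assumed import path for this package
)

func main() {
	// Per the new test cases below: in "if(0)/1/g" the ')' closes statement
	// parens, so "/1/g" is lexed as one RegexpToken; in "(a+b)/1/g" the ')'
	// closes expression parens, so '/' is the division operator.
	for _, src := range []string{"if(0)/1/g", "(a+b)/1/g"} {
		l := js.NewLexer(bytes.NewBufferString(src))
		fmt.Printf("%q:", src)
		for {
			tt, data := l.Next()
			if tt == js.ErrorToken {
				break
			}
			fmt.Printf(" %s(%s)", tt, data)
		}
		fmt.Println()
	}
}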
129 changes: 102 additions & 27 deletions js/lex.go
@@ -33,6 +33,30 @@ const (
 	TemplateToken
 )
 
+// TokenState determines the state in which the next token should be read.
+type TokenState uint32
+
+// TokenState values
+const (
+	ExprState TokenState = iota
+	StmtParensState
+	SubscriptState
+	PropNameState
+)
+
+// ParsingContext determines the context in which the following token should be parsed.
+// This affects the parsing of regular expressions and template literals.
+type ParsingContext uint32
+
+// ParsingContext values
+const (
+	GlobalContext ParsingContext = iota
+	StmtParensContext
+	ExprParensContext
+	BracesContext
+	TemplateContext
+)
+
 // String returns the string representation of a TokenType.
 func (tt TokenType) String() string {
 	switch tt {
@@ -66,21 +90,34 @@ func (tt TokenType) String() string {
 
 // Lexer is the state for the lexer.
 type Lexer struct {
-	r *buffer.Lexer
-
-	regexpState   bool
-	templateState bool
-
+	r         *buffer.Lexer
+	stack     []ParsingContext
+	state     TokenState
 	emptyLine bool
 }
 
 // NewLexer returns a new Lexer for a given io.Reader.
 func NewLexer(r io.Reader) *Lexer {
 	return &Lexer{
-		r: buffer.NewLexer(r),
+		r:         buffer.NewLexer(r),
+		stack:     make([]ParsingContext, 0),
+		state:     ExprState,
+		emptyLine: true,
 	}
 }
 
+func (l *Lexer) enterContext(context ParsingContext) {
+	l.stack = append(l.stack, context)
+}
+
+func (l *Lexer) leaveContext() ParsingContext {
+	ctx := GlobalContext
+	if last := len(l.stack) - 1; last >= 0 {
+		ctx, l.stack = l.stack[last], l.stack[:last]
+	}
+	return ctx
+}
+
 // Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
 func (l *Lexer) Err() error {
 	return l.r.Err()
@@ -96,36 +133,75 @@ func (l *Lexer) Next() (TokenType, []byte) {
 	tt := UnknownToken
 	c := l.r.Peek(0)
 	switch c {
-	case '(', ')', '[', ']', '{', '}', ';', ',', '~', '?', ':':
-		if c == '}' && l.templateState && l.consumeTemplateToken() {
+	case '(':
+		if l.state == StmtParensState {
+			l.enterContext(StmtParensContext)
+		} else {
+			l.enterContext(ExprParensContext)
+		}
+		l.state = ExprState
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case ')':
+		if l.leaveContext() == StmtParensContext {
+			l.state = ExprState
+		} else {
+			l.state = SubscriptState
+		}
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case '{':
+		l.enterContext(BracesContext)
+		l.state = ExprState
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case '}':
+		if l.leaveContext() == TemplateContext && l.consumeTemplateToken() {
 			tt = TemplateToken
 		} else {
+			// will work incorrectly for objects or functions divided by something,
+			// but that's an extremely rare case
+			l.state = ExprState
 			l.r.Move(1)
 			tt = PunctuatorToken
 		}
+	case ']':
+		l.state = SubscriptState
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case '[', ';', ',', '~', '?', ':':
+		l.state = ExprState
+		l.r.Move(1)
+		tt = PunctuatorToken
 	case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^':
 		if (c == '<' || (l.emptyLine && c == '-')) && l.consumeCommentToken() {
 			return CommentToken, l.r.Shift()
 		} else if l.consumeLongPunctuatorToken() {
+			l.state = ExprState
 			tt = PunctuatorToken
 		}
 	case '/':
 		if l.consumeCommentToken() {
 			return CommentToken, l.r.Shift()
-		} else if l.regexpState && l.consumeRegexpToken() {
+		} else if l.state == ExprState && l.consumeRegexpToken() {
+			l.state = SubscriptState
 			tt = RegexpToken
 		} else if l.consumeLongPunctuatorToken() {
+			l.state = ExprState
 			tt = PunctuatorToken
 		}
 	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
 		if l.consumeNumericToken() {
 			tt = NumericToken
+			l.state = SubscriptState
 		} else if c == '.' {
+			l.state = PropNameState
 			l.r.Move(1)
 			tt = PunctuatorToken
 		}
 	case '\'', '"':
 		if l.consumeStringToken() {
+			l.state = SubscriptState
 			tt = StringToken
 		}
 	case ' ', '\t', '\v', '\f':
@@ -139,13 +215,27 @@ func (l *Lexer) Next() (TokenType, []byte) {
 		}
 		tt = LineTerminatorToken
 	case '`':
-		l.templateState = true
 		if l.consumeTemplateToken() {
 			tt = TemplateToken
 		}
 	default:
 		if l.consumeIdentifierToken() {
 			tt = IdentifierToken
+			if l.state != PropNameState {
+				switch hash := ToHash(l.r.Lexeme()); hash {
+				case 0, This, False, True, Null:
+					l.state = SubscriptState
+				case If, While, For, With:
+					l.state = StmtParensState
+				default:
+					// This will include keywords that can't be followed by a regexp but only
+					// by a specific character (like `switch` or `try`); we don't check for
+					// syntax errors as we don't attempt to parse the full JS grammar when streaming
+					l.state = ExprState
+				}
+			} else {
+				l.state = SubscriptState
+			}
 		} else if c >= 0xC0 {
 			if l.consumeWhitespace() {
 				for l.consumeWhitespace() {
@@ -163,23 +253,6 @@ func (l *Lexer) Next() (TokenType, []byte) {
 
 	l.emptyLine = tt == LineTerminatorToken
 
-	// differentiate between divisor and regexp state, because the '/' character is ambiguous!
-	// ErrorToken, WhitespaceToken and CommentToken are already returned
-	if tt == LineTerminatorToken || tt == PunctuatorToken && regexpStateByte[c] {
-		l.regexpState = true
-	} else if tt == IdentifierToken {
-		switch hash := ToHash(l.r.Lexeme()); hash {
-		case 0, This, False, True, Null:
-			l.regexpState = false
-		default:
-			// This will include keywords that can't be followed by a regexp, but only
-			// by a specified char (like `if` or `try`), but we don't check for syntax
-			// errors as we don't attempt to parse a full JS grammar when streaming
-			l.regexpState = true
-		}
-	} else {
-		l.regexpState = false
-	}
 	if tt == UnknownToken {
 		_, n := l.r.PeekRune(0)
 		l.r.Move(n)
@@ -560,10 +633,12 @@ func (l *Lexer) consumeTemplateToken() bool {
 	for {
 		c := l.r.Peek(0)
 		if c == '`' {
-			l.templateState = false
+			l.state = SubscriptState
 			l.r.Move(1)
 			return true
 		} else if c == '$' && l.r.Peek(1) == '{' {
+			l.enterContext(TemplateContext)
+			l.state = ExprState
 			l.r.Move(2)
 			return true
 		} else if c == 0 {
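The reworked consumeTemplateToken above is where the context stack pays off: `${` pushes TemplateContext, an inner `{` pushes BracesContext, and each `}` consults leaveContext to decide whether it is an ordinary punctuator or the resumption of a template. A sketch of the resulting token stream for a template containing an object literal (same assumed import path as in the earlier sketch):

package main

import (
	"bytes"
	"fmt"

	"github.com/tdewolff/parse/js" // assumed import path
)

func main() {
	// Expected stream, mirroring the nested-template test added below:
	//   Template("`a${"), Punctuator("{"), Identifier("x"), Punctuator(":"),
	//   Numeric("1"), Punctuator("}"), Template("}b`")
	// The first '}' pops BracesContext and stays an ordinary punctuator;
	// only the second '}' pops TemplateContext and resumes the template.
	// With a single boolean templateState, the first '}' would already
	// have resumed the template.
	l := js.NewLexer(bytes.NewBufferString("`a${{x:1}}b`"))
	for {
		tt, data := l.Next()
		if tt == js.ErrorToken {
			break
		}
		fmt.Printf("%s(%s)\n", tt, data)
	}
}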
47 changes: 39 additions & 8 deletions js/lex_test.go
@@ -10,10 +10,10 @@ import (
 	"github.com/tdewolff/test"
 )
 
-func helperStringify(t *testing.T, input string) string {
+func helperStringify(t *testing.T, input string, index int) string {
 	s := ""
 	l := NewLexer(bytes.NewBufferString(input))
-	for i := 0; i < 10; i++ {
+	for i := 0; i <= index; i++ {
 		tt, data := l.Next()
 		if tt == ErrorToken {
 			if l.Err() != nil {
@@ -28,7 +28,7 @@ func helperStringify(t *testing.T, input string) string {
 			s += tt.String() + "('" + string(data) + "') "
 		}
 	}
-	return s
+	return s + " with code: " + strconv.Quote(input)
 }
 
 ////////////////////////////////////////////////////////////////
@@ -63,8 +63,8 @@ func TestTokens(t *testing.T) {
 		{"1 /*comment\nmultiline*/ -->nothing\n", TTs{NumericToken, CommentToken, CommentToken, LineTerminatorToken}},
 		{"$ _\u200C \\u2000 \u200C", TTs{IdentifierToken, IdentifierToken, IdentifierToken, UnknownToken}},
 		{">>>=>>>>=", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken}},
-		{"/", TTs{PunctuatorToken}},
-		{"/=", TTs{PunctuatorToken}},
+		{"1/", TTs{NumericToken, PunctuatorToken}},
+		{"1/=", TTs{NumericToken, PunctuatorToken}},
 		{"010xF", TTs{NumericToken, NumericToken, IdentifierToken}},
 		{"50e+-0", TTs{NumericToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken}},
 		{"'str\\i\\'ng'", TTs{StringToken}},
@@ -80,6 +80,7 @@ func TestTokens(t *testing.T) {
 		{"`template`", TTs{TemplateToken}},
 		{"`a${x+y}b`", TTs{TemplateToken, IdentifierToken, PunctuatorToken, IdentifierToken, TemplateToken}},
 		{"`temp\nlate`", TTs{TemplateToken}},
+		{"`outer${{x: 10}}bar${ raw`nested${2}endnest` }end`", TTs{TemplateToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, TemplateToken, IdentifierToken, TemplateToken, NumericToken, TemplateToken, TemplateToken}},
 
 		// early endings
 		{"'string", TTs{StringToken}},
@@ -108,31 +109,61 @@ func TestTokens(t *testing.T) {
 		{"return /abc/;", TTs{IdentifierToken, RegexpToken, PunctuatorToken}},
 		{"yield /abc/;", TTs{IdentifierToken, RegexpToken, PunctuatorToken}},
 		{"a/b/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, IdentifierToken}},
+		{"{}/1/g", TTs{PunctuatorToken, PunctuatorToken, RegexpToken}},
+		{"i(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"if(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}},
+		{"a.if(0)/1/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"while(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}},
+		{"for(;;)/1/g", TTs{IdentifierToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, RegexpToken}},
+		{"with(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}},
+		{"this/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"case /1/g:", TTs{IdentifierToken, RegexpToken, PunctuatorToken}},
+		{"function f(){}/1/g", TTs{IdentifierToken, IdentifierToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, RegexpToken}},
+		{"this.return/1/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"(a+b)/1/g", TTs{PunctuatorToken, IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
 
 		// go fuzz
 		{"`", TTs{UnknownToken}},
 	}
 
+	passed := 0
+
 	for _, tt := range tokenTests {
-		stringify := helperStringify(t, tt.js)
 		l := NewLexer(bytes.NewBufferString(tt.js))
 		i := 0
+		j := 0
 		for {
 			token, _ := l.Next()
+			j++
 			if token == ErrorToken {
+				stringify := helperStringify(t, tt.js, j)
 				test.That(t, i == len(tt.expected), "when error occurred we must be at the end in "+stringify)
 				test.Error(t, l.Err(), io.EOF, "in "+stringify)
+				passed++
 				break
 			} else if token == WhitespaceToken {
 				continue
 			}
-			test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected), "in "+stringify)
 			if i < len(tt.expected) {
-				test.That(t, token == tt.expected[i], "token types must match at index "+strconv.Itoa(i)+" in "+stringify)
+				expected := tt.expected[i]
+				if token != expected {
+					stringify := helperStringify(t, tt.js, j)
+					test.String(t, token.String(), expected.String(), "token types must match at index "+strconv.Itoa(i)+" in "+stringify)
+					break
+				}
+			} else {
+				stringify := helperStringify(t, tt.js, j)
+				test.That(t, false, "index", i, "must not exceed expected token types size", len(tt.expected), "in "+stringify)
+				break
 			}
 			i++
 		}
 	}
 
+	if passed != len(tokenTests) {
+		t.Logf("Failed %d / %d token tests", len(tokenTests)-passed, len(tokenTests))
+	}
+
 	test.String(t, WhitespaceToken.String(), "Whitespace")
 	test.String(t, TokenType(100).String(), "Invalid(100)")
 }
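One deliberate limitation is visible in these tests: after a `}` the lexer always assumes expression position (the "will work incorrectly for objects or functions divided by something" comment in lex.go), so `function f(){}/1/g` lexes `/1/g` as a RegexpToken. That is correct for a function declaration but wrong for a function expression being divided; the streaming lexer accepts the rare miss rather than parse the full grammar. A small probe of that edge, under the same assumptions as the earlier sketches:

package main

import (
	"bytes"
	"fmt"

	"github.com/tdewolff/parse/js" // assumed import path
)

func main() {
	// Prints a RegexpToken for "/1/g": right for a declaration, but it would
	// be division in e.g. "x = function f(){}/1/g", the case the in-code
	// comment concedes is lexed incorrectly.
	l := js.NewLexer(bytes.NewBufferString("function f(){}/1/g"))
	for {
		tt, data := l.Next()
		if tt == js.ErrorToken {
			break
		}
		fmt.Printf("%s(%s)\n", tt, data)
	}
}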