Commit e3d09bb

Merge pull request #27 from RReverser/sweet-js-rework

Add context-aware state machine

tdewolff authored Jun 10, 2017 · 2 parents 874ae8f + 68e2aa9
Showing 3 changed files with 141 additions and 167 deletions.
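For orientation before the diff: `/` is ambiguous in JavaScript (division operator vs. start of a regular expression), and `}` becomes ambiguous once template literals are involved (closing a block or object vs. resuming a template after `${…}`). The old lexer tracked two booleans, `regexpState` and `templateState`; this commit replaces them with a `TokenState` plus a stack of `ParsingContext` values so both ambiguities are resolved correctly under nesting. A minimal sketch of the `/` case, driving the lexer through the API shown in this diff (the `github.com/tdewolff/parse/js` import path is an assumption, not part of the diff):

package main

import (
	"bytes"
	"fmt"

	"github.com/tdewolff/parse/js" // assumed import path for this package
)

func main() {
	// Per the new test cases below: in "if(0)/1/g" the ')' closes statement
	// parens, so "/1/g" is lexed as one RegexpToken; in "(a+b)/1/g" the ')'
	// closes expression parens, so '/' is the division operator.
	for _, src := range []string{"if(0)/1/g", "(a+b)/1/g"} {
		l := js.NewLexer(bytes.NewBufferString(src))
		fmt.Printf("%q:", src)
		for {
			tt, data := l.Next()
			if tt == js.ErrorToken {
				break
			}
			fmt.Printf(" %s(%s)", tt, data)
		}
		fmt.Println()
	}
}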
129 changes: 102 additions & 27 deletions js/lex.go
@@ -33,6 +33,30 @@ const (
 	TemplateToken
 )
 
+// TokenState determines the state in which the next token should be read.
+type TokenState uint32
+
+// TokenState values
+const (
+	ExprState TokenState = iota
+	StmtParensState
+	SubscriptState
+	PropNameState
+)
+
+// ParsingContext determines the context in which the following token should be parsed.
+// This affects the parsing of regular expressions and template literals.
+type ParsingContext uint32
+
+// ParsingContext values
+const (
+	GlobalContext ParsingContext = iota
+	StmtParensContext
+	ExprParensContext
+	BracesContext
+	TemplateContext
+)
+
 // String returns the string representation of a TokenType.
 func (tt TokenType) String() string {
 	switch tt {
@@ -66,21 +90,34 @@ func (tt TokenType) String() string {
 
 // Lexer is the state for the lexer.
 type Lexer struct {
-	r *buffer.Lexer
-
-	regexpState   bool
-	templateState bool
-
+	r         *buffer.Lexer
+	stack     []ParsingContext
+	state     TokenState
 	emptyLine bool
 }
 
 // NewLexer returns a new Lexer for a given io.Reader.
 func NewLexer(r io.Reader) *Lexer {
 	return &Lexer{
-		r: buffer.NewLexer(r),
+		r:         buffer.NewLexer(r),
+		stack:     make([]ParsingContext, 0),
+		state:     ExprState,
+		emptyLine: true,
 	}
 }
 
+func (l *Lexer) enterContext(context ParsingContext) {
+	l.stack = append(l.stack, context)
+}
+
+func (l *Lexer) leaveContext() ParsingContext {
+	ctx := GlobalContext
+	if last := len(l.stack) - 1; last >= 0 {
+		ctx, l.stack = l.stack[last], l.stack[:last]
+	}
+	return ctx
+}
+
 // Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
 func (l *Lexer) Err() error {
 	return l.r.Err()
@@ -96,36 +133,75 @@ func (l *Lexer) Next() (TokenType, []byte) {
 	tt := UnknownToken
 	c := l.r.Peek(0)
 	switch c {
-	case '(', ')', '[', ']', '{', '}', ';', ',', '~', '?', ':':
-		if c == '}' && l.templateState && l.consumeTemplateToken() {
+	case '(':
+		if l.state == StmtParensState {
+			l.enterContext(StmtParensContext)
+		} else {
+			l.enterContext(ExprParensContext)
+		}
+		l.state = ExprState
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case ')':
+		if l.leaveContext() == StmtParensContext {
+			l.state = ExprState
+		} else {
+			l.state = SubscriptState
+		}
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case '{':
+		l.enterContext(BracesContext)
+		l.state = ExprState
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case '}':
+		if l.leaveContext() == TemplateContext && l.consumeTemplateToken() {
 			tt = TemplateToken
 		} else {
+			// will work incorrectly for objects or functions divided by something,
+			// but that's an extremely rare case
+			l.state = ExprState
 			l.r.Move(1)
 			tt = PunctuatorToken
 		}
+	case ']':
+		l.state = SubscriptState
+		l.r.Move(1)
+		tt = PunctuatorToken
+	case '[', ';', ',', '~', '?', ':':
+		l.state = ExprState
+		l.r.Move(1)
+		tt = PunctuatorToken
 	case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^':
 		if (c == '<' || (l.emptyLine && c == '-')) && l.consumeCommentToken() {
 			return CommentToken, l.r.Shift()
 		} else if l.consumeLongPunctuatorToken() {
+			l.state = ExprState
 			tt = PunctuatorToken
 		}
 	case '/':
 		if l.consumeCommentToken() {
 			return CommentToken, l.r.Shift()
-		} else if l.regexpState && l.consumeRegexpToken() {
+		} else if l.state == ExprState && l.consumeRegexpToken() {
+			l.state = SubscriptState
 			tt = RegexpToken
 		} else if l.consumeLongPunctuatorToken() {
+			l.state = ExprState
 			tt = PunctuatorToken
 		}
 	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
 		if l.consumeNumericToken() {
 			tt = NumericToken
+			l.state = SubscriptState
 		} else if c == '.' {
+			l.state = PropNameState
 			l.r.Move(1)
 			tt = PunctuatorToken
 		}
 	case '\'', '"':
 		if l.consumeStringToken() {
+			l.state = SubscriptState
 			tt = StringToken
 		}
 	case ' ', '\t', '\v', '\f':
@@ -139,13 +215,27 @@ func (l *Lexer) Next() (TokenType, []byte) {
 		}
 		tt = LineTerminatorToken
 	case '`':
-		l.templateState = true
 		if l.consumeTemplateToken() {
 			tt = TemplateToken
 		}
 	default:
 		if l.consumeIdentifierToken() {
 			tt = IdentifierToken
+			if l.state != PropNameState {
+				switch hash := ToHash(l.r.Lexeme()); hash {
+				case 0, This, False, True, Null:
+					l.state = SubscriptState
+				case If, While, For, With:
+					l.state = StmtParensState
+				default:
+					// This will include keywords that can't be followed by a regexp but only
+					// by a specific character (like `switch` or `try`); we don't check for
+					// syntax errors as we don't attempt to parse the full JS grammar when streaming
+					l.state = ExprState
+				}
+			} else {
+				l.state = SubscriptState
+			}
 		} else if c >= 0xC0 {
 			if l.consumeWhitespace() {
 				for l.consumeWhitespace() {
@@ -163,23 +253,6 @@ func (l *Lexer) Next() (TokenType, []byte) {
 
 	l.emptyLine = tt == LineTerminatorToken
 
-	// differentiate between divisor and regexp state, because the '/' character is ambiguous!
-	// ErrorToken, WhitespaceToken and CommentToken are already returned
-	if tt == LineTerminatorToken || tt == PunctuatorToken && regexpStateByte[c] {
-		l.regexpState = true
-	} else if tt == IdentifierToken {
-		switch hash := ToHash(l.r.Lexeme()); hash {
-		case 0, This, False, True, Null:
-			l.regexpState = false
-		default:
-			// This will include keywords that can't be followed by a regexp, but only
-			// by a specified char (like `if` or `try`), but we don't check for syntax
-			// errors as we don't attempt to parse a full JS grammar when streaming
-			l.regexpState = true
-		}
-	} else {
-		l.regexpState = false
-	}
 	if tt == UnknownToken {
 		_, n := l.r.PeekRune(0)
 		l.r.Move(n)
@@ -560,10 +633,12 @@ func (l *Lexer) consumeTemplateToken() bool {
 	for {
 		c := l.r.Peek(0)
 		if c == '`' {
-			l.templateState = false
+			l.state = SubscriptState
 			l.r.Move(1)
 			return true
 		} else if c == '$' && l.r.Peek(1) == '{' {
+			l.enterContext(TemplateContext)
+			l.state = ExprState
 			l.r.Move(2)
 			return true
 		} else if c == 0 {
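The reworked consumeTemplateToken above is where the context stack pays off: `${` pushes TemplateContext, an inner `{` pushes BracesContext, and each `}` consults leaveContext to decide whether it is an ordinary punctuator or the resumption of a template. A sketch of the resulting token stream for a template containing an object literal (same assumed import path as in the earlier sketch):

package main

import (
	"bytes"
	"fmt"

	"github.com/tdewolff/parse/js" // assumed import path
)

func main() {
	// Expected stream, mirroring the nested-template test added below:
	//   Template("`a${"), Punctuator("{"), Identifier("x"), Punctuator(":"),
	//   Numeric("1"), Punctuator("}"), Template("}b`")
	// The first '}' pops BracesContext and stays an ordinary punctuator;
	// only the second '}' pops TemplateContext and resumes the template.
	// With a single boolean templateState, the first '}' would already
	// have resumed the template.
	l := js.NewLexer(bytes.NewBufferString("`a${{x:1}}b`"))
	for {
		tt, data := l.Next()
		if tt == js.ErrorToken {
			break
		}
		fmt.Printf("%s(%s)\n", tt, data)
	}
}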
47 changes: 39 additions & 8 deletions js/lex_test.go
@@ -10,10 +10,10 @@ import (
 	"github.com/tdewolff/test"
 )
 
-func helperStringify(t *testing.T, input string) string {
+func helperStringify(t *testing.T, input string, index int) string {
 	s := ""
 	l := NewLexer(bytes.NewBufferString(input))
-	for i := 0; i < 10; i++ {
+	for i := 0; i <= index; i++ {
 		tt, data := l.Next()
 		if tt == ErrorToken {
 			if l.Err() != nil {
@@ -28,7 +28,7 @@ func helperStringify(t *testing.T, input string) string {
 			s += tt.String() + "('" + string(data) + "') "
 		}
 	}
-	return s
+	return s + " with code: " + strconv.Quote(input)
 }
 
 ////////////////////////////////////////////////////////////////
@@ -63,8 +63,8 @@ func TestTokens(t *testing.T) {
 		{"1 /*comment\nmultiline*/ -->nothing\n", TTs{NumericToken, CommentToken, CommentToken, LineTerminatorToken}},
 		{"$ _\u200C \\u2000 \u200C", TTs{IdentifierToken, IdentifierToken, IdentifierToken, UnknownToken}},
 		{">>>=>>>>=", TTs{PunctuatorToken, PunctuatorToken, PunctuatorToken}},
-		{"/", TTs{PunctuatorToken}},
-		{"/=", TTs{PunctuatorToken}},
+		{"1/", TTs{NumericToken, PunctuatorToken}},
+		{"1/=", TTs{NumericToken, PunctuatorToken}},
 		{"010xF", TTs{NumericToken, NumericToken, IdentifierToken}},
 		{"50e+-0", TTs{NumericToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken}},
 		{"'str\\i\\'ng'", TTs{StringToken}},
@@ -80,6 +80,7 @@ func TestTokens(t *testing.T) {
 		{"`template`", TTs{TemplateToken}},
 		{"`a${x+y}b`", TTs{TemplateToken, IdentifierToken, PunctuatorToken, IdentifierToken, TemplateToken}},
 		{"`temp\nlate`", TTs{TemplateToken}},
+		{"`outer${{x: 10}}bar${ raw`nested${2}endnest` }end`", TTs{TemplateToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, TemplateToken, IdentifierToken, TemplateToken, NumericToken, TemplateToken, TemplateToken}},
 
 		// early endings
 		{"'string", TTs{StringToken}},
@@ -108,31 +109,61 @@ func TestTokens(t *testing.T) {
 		{"return /abc/;", TTs{IdentifierToken, RegexpToken, PunctuatorToken}},
 		{"yield /abc/;", TTs{IdentifierToken, RegexpToken, PunctuatorToken}},
 		{"a/b/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, IdentifierToken}},
+		{"{}/1/g", TTs{PunctuatorToken, PunctuatorToken, RegexpToken}},
+		{"i(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"if(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}},
+		{"a.if(0)/1/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"while(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}},
+		{"for(;;)/1/g", TTs{IdentifierToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, RegexpToken}},
+		{"with(0)/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, RegexpToken}},
+		{"this/1/g", TTs{IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"case /1/g:", TTs{IdentifierToken, RegexpToken, PunctuatorToken}},
+		{"function f(){}/1/g", TTs{IdentifierToken, IdentifierToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, PunctuatorToken, RegexpToken}},
+		{"this.return/1/g", TTs{IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
+		{"(a+b)/1/g", TTs{PunctuatorToken, IdentifierToken, PunctuatorToken, IdentifierToken, PunctuatorToken, PunctuatorToken, NumericToken, PunctuatorToken, IdentifierToken}},
 
 		// go fuzz
 		{"`", TTs{UnknownToken}},
 	}
 
+	passed := 0
+
 	for _, tt := range tokenTests {
-		stringify := helperStringify(t, tt.js)
 		l := NewLexer(bytes.NewBufferString(tt.js))
 		i := 0
+		j := 0
 		for {
 			token, _ := l.Next()
+			j++
 			if token == ErrorToken {
+				stringify := helperStringify(t, tt.js, j)
 				test.That(t, i == len(tt.expected), "when error occurred we must be at the end in "+stringify)
 				test.Error(t, l.Err(), io.EOF, "in "+stringify)
+				passed++
 				break
 			} else if token == WhitespaceToken {
 				continue
 			}
-			test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected), "in "+stringify)
 			if i < len(tt.expected) {
-				test.That(t, token == tt.expected[i], "token types must match at index "+strconv.Itoa(i)+" in "+stringify)
+				expected := tt.expected[i]
+				if token != expected {
+					stringify := helperStringify(t, tt.js, j)
+					test.String(t, token.String(), expected.String(), "token types must match at index "+strconv.Itoa(i)+" in "+stringify)
+					break
+				}
+			} else {
+				stringify := helperStringify(t, tt.js, j)
+				test.That(t, false, "index", i, "must not exceed expected token types size", len(tt.expected), "in "+stringify)
+				break
 			}
 			i++
 		}
 	}
 
+	if passed != len(tokenTests) {
+		t.Logf("Failed %d / %d token tests", len(tokenTests)-passed, len(tokenTests))
+	}
+
 	test.String(t, WhitespaceToken.String(), "Whitespace")
 	test.String(t, TokenType(100).String(), "Invalid(100)")
 }
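One deliberate limitation is visible in these tests: after a `}` the lexer always assumes expression position (the "will work incorrectly for objects or functions divided by something" comment in lex.go), so `function f(){}/1/g` lexes `/1/g` as a RegexpToken. That is correct for a function declaration but wrong for a function expression being divided; the streaming lexer accepts the rare miss rather than parse the full grammar. A small probe of that edge, under the same assumptions as the earlier sketches:

package main

import (
	"bytes"
	"fmt"

	"github.com/tdewolff/parse/js" // assumed import path
)

func main() {
	// Prints a RegexpToken for "/1/g": right for a declaration, but it would
	// be division in e.g. "x = function f(){}/1/g", the case the in-code
	// comment concedes is lexed incorrectly.
	l := js.NewLexer(bytes.NewBufferString("function f(){}/1/g"))
	for {
		tt, data := l.Next()
		if tt == js.ErrorToken {
			break
		}
		fmt.Printf("%s(%s)\n", tt, data)
	}
}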