gnoverse · notJoon · Jan 19, 2025 · Jan 18, 2025 · Jan 18, 2025 · Jan 18, 2025
diff --git a/.github/golangci.yml b/.github/golangci.yml
@@ -81,3 +81,10 @@ issues:
     - path: _\.gno
       linters:
         - errorlint # Disabled linting of error comparisons, because of lacking std lib support
+
+    - path: fixer_v2/query/internal\.go$
+      linters:
+        - gofmt
+        - gofumpt
+        - goimports
+        - whitespace
diff --git a/Makefile b/Makefile
@@ -63,6 +63,6 @@ lint:
 	golangci-lint run
 
 fmt:
-	go fmt ./...
+	find . -name "*.go" ! -path "./fixer_v2/query/internal.go" -exec go fmt {} \;
 
 .PHONY: all build test clean run deps build-linux build-windows build-mac build-all install-linter lint
diff --git a/fixer_v2/query/internal.go b/fixer_v2/query/internal.go
@@ -0,0 +1,252 @@
+package query
+
+import (
+	"fmt"
+	"strings"
+)
+
+/*
+State Transition Machine Design Rationale
+
+This lexer uses a state transition machine approach instead of a traditional
+hand-coded switch-case lexer for several performance-critical reasons:
+
+1. Branch Prediction Optimization
+   - Traditional lexers with multiple if/switch statements suffer from branch
+     misprediction penalties. Each token type check causes a branch, and modern
+     CPUs struggle to predict these branches effectively.
+   - State machine approach consolidates branching into a single, predictable
+     loop with table-driven transitions, reducing branch mispredictions.
+
+2. Token Processing Efficiency
+   - The state machine processes input character-by-character in a tight loop
+     using lookup tables, rather than repeatedly examining characters with
+     conditional logic.
+   - Token length tracking can be folded into the same loop via an 'in_token' table
+     as in [1] (not defined in this file), avoiding separate length calculations.
+
+3. Memory Access Patterns
+   - The transition table, while larger than hand-coded logic, provides more
+     predictable memory access patterns that modern CPU caches can handle efficiently.
+   - Character class equivalence is used to reduce the transition table size
+     while maintaining performance (e.g., most alphabetic characters behave similarly).
+
+4. Unified Whitespace and Token Processing
+   - The state machine handles both whitespace skipping and token recognition
+     in the same loop, eliminating additional branch mispredictions that would
+     occur when switching between these modes.
+
+5. Extensibility and Maintainability
+   - Adding new token types only requires updating the transition table rather
+     than modifying complex branching logic.
+   - The state machine structure makes it easier to verify and maintain the lexer's
+     behavior compared to nested conditional logic.
+
+Implementation Notes:
+   1. States are arranged so that final states have lower numbers, allowing for a single
+    comparison to detect when token recognition is complete.
+   2. The transition table is structured for efficient CPU cache usage by minimizing
+    the table size through character equivalence classes.
+   3. The design supports both simple tokens (like operators) and complex tokens
+    (like identifiers) while maintaining consistent performance characteristics.
+
+Reference:
+ [1] https://nothings.org/computer/lexing.html
+ */
+
+type (
+	States  int8 // States is a lexer state and the row index into StateTransitionTable
+	Classes int8 // Classes is a character class and the column index into StateTransitionTable
+)
+
+// Lexer states. Each value is a row index into StateTransitionTable:
+//   - GO (0)  - Initial state, ready to start processing input
+//   - OK (1)  - Accept state, token successfully recognized
+//   - CL (2)  - After a colon; '[' leads to OB, an identifier character to ID
+//   - OB (3)  - After the first opening bracket; a second '[' leads to DB
+//   - DB (4)  - After a double opening bracket, expecting a name
+//   - NM (5)  - Reading the name part of a metavariable
+//   - ID (6)  - Reading a type identifier (entered on a colon from NM, or an identifier character from CL)
+//   - CB (7)  - After the first closing bracket
+//   - QB (8)  - After the second closing bracket
+//   - QT (9)  - After a quantifier (*, +, ?)
+//   - TX (10) - Processing regular text
+//   - WS (11) - Processing whitespace
+//   - BR (12) - Processing block delimiters ({, })
+//
+// The numbering is significant: the final states (GO, OK) are the lowest values,
+// so a lexer loop can detect completion with a single `state <= OK` comparison.
+const (
+	GO States = iota // Initial state
+	OK               // Accept state (successful parse)
+	CL               // After colon state (:)
+	OB               // After first bracket state ([)
+	DB               // After double bracket state ([[)
+	NM               // Reading name state
+	ID               // Reading type identifier state
+	CB               // After closing bracket state (])
+	QB               // After double closing bracket state (]])
+	QT               // Reading quantifier state (*, +, ?)
+	TX               // Reading text state
+	WS               // Reading whitespace state
+	BR               // Reading block state ({, })
+)
+
+// Character classes assigned by getCharacterClass; each value is a column index into StateTransitionTable
+const (
+	C_COLON  Classes = iota // Colon character (:)
+	C_LBRACK                // Left bracket ([)
+	C_RBRACK                // Right bracket (])
+	C_LBRACE                // Left brace ({)
+	C_RBRACE                // Right brace (})
+	C_SPACE                 // Whitespace, as decided by isWhitespace
+	C_IDENT                 // Identifier characters (ASCII letters and digits, _, -)
+	C_QUANT                 // Quantifiers (*, +, ?)
+	C_OTHER                 // Any other character
+)
+
+// StateTransitionTable maps [current state][character class] to the next state.
+// Rows follow the States constants and columns the Classes constants. Notes:
+//  1. From NM, a colon moves to ID so a type specification can follow the name
+//  2. CB and QB accept whitespace (-> WS) for better error recovery
+//  3. OK, QT, TX and WS share GO's row, so any valid pattern start may follow them
+//  4. NOTE(review): CB/QB on a colon go to OK rather than CL - confirm this is intended
+var StateTransitionTable = [13][9]States{
+    //          COLON  LBRACK RBRACK LBRACE RBRACE SPACE  IDENT  QUANT  OTHER
+    /* GO  */ { CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX   },
+    /* OK  */ { CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX   },
+    /* CL  */ { TX,    OB,    TX,    TX,    TX,    TX,    ID,    TX,    TX   },
+    /* OB  */ { TX,    DB,    TX,    TX,    TX,    TX,    NM,    TX,    TX   },
+    /* DB  */ { TX,    TX,    TX,    TX,    TX,    TX,    NM,    TX,    TX   },
+    /* NM  */ { ID,    TX,    CB,    TX,    TX,    TX,    NM,    TX,    TX   }, // Transition to ID state when colon is encountered
+    /* ID  */ { TX,    TX,    CB,    TX,    TX,    TX,    ID,    TX,    TX   },
+    /* CB  */ { OK,    TX,    QB,    TX,    TX,    WS,    TX,    QT,    TX   }, // Handle whitespace for better error recovery
+    /* QB  */ { OK,    TX,    TX,    TX,    TX,    WS,    TX,    QT,    TX   }, // Handle whitespace for better error recovery
+    /* QT  */ { CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX   },
+    /* TX  */ { CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX   },
+    /* WS  */ { CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX   },
+    /* BR  */ { CL,    TX,    TX,    BR,    OK,    WS,    TX,    TX,    TX   },
+}
+
+// String returns the upper-case name of the character class, e.g. "COLON"
+// for C_COLON. Any value outside the declared constants yields "UNKNOWN".
+func (c Classes) String() string {
+	// Keyed array literal: each name sits at the index of its constant, so the
+	// lookup below stays correct even if the constants are reordered.
+	names := [...]string{
+		C_COLON:  "COLON",
+		C_LBRACK: "LBRACK",
+		C_RBRACK: "RBRACK",
+		C_LBRACE: "LBRACE",
+		C_RBRACE: "RBRACE",
+		C_SPACE:  "SPACE",
+		C_IDENT:  "IDENT",
+		C_QUANT:  "QUANT",
+		C_OTHER:  "OTHER",
+	}
+
+	// Classes is a signed type, so guard both ends before indexing.
+	if c < 0 || int(c) >= len(names) {
+		return "UNKNOWN"
+	}
+
+	return names[c]
+}
+
+// StateMachine walks an input pattern byte by byte through StateTransitionTable
+type StateMachine struct {
+	state    States // Current state
+	input    string // Input pattern to parse
+	position int    // Byte offset of the next character to read from input
+}
+
+// NewStateMachine returns a machine in the initial GO state, positioned at the first byte of input.
+func NewStateMachine(input string) *StateMachine {
+	sm := new(StateMachine)
+	sm.state, sm.position = GO, 0
+	sm.input = input
+	return sm
+}
+
+// Transition records a single table lookup made while scanning the input
+type Transition struct {
+	char      byte    // input byte that was classified
+	fromState States  // state before the byte was consumed
+	class     Classes // character class assigned to char
+	toState   States  // state selected by StateTransitionTable[fromState][class]
+}
+
+// recordTransitions runs the machine over the unread part of sm.input, one
+// byte per step, and returns the steps taken in input order. It advances
+// sm.state and sm.position as it goes and returns nil if nothing is left to read.
+func (sm *StateMachine) recordTransitions() []Transition {
+	var trace []Transition
+
+	for ; sm.position < len(sm.input); sm.position++ {
+		// Capture the "before" half of the step prior to touching sm.state.
+		step := Transition{
+			char:      sm.input[sm.position],
+			fromState: sm.state,
+		}
+
+		// Classify the byte, then look up where the table sends us.
+		step.class = getCharacterClass(step.char)
+		step.toState = StateTransitionTable[step.fromState][step.class]
+
+		trace = append(trace, step)
+		sm.state = step.toState
+	}
+	return trace
+}
+
+func visualizeTransitions(transitions []Transition) string {
+	var b strings.Builder
+	for _, t := range transitions {
+		fmt.Fprintf(&b, "%c: %v -%v-> %v\n",
+			t.char, t.fromState, t.class, t.toState)
+	}
+	return b.String()
+}
+
+// getCharacterClass maps a single input byte to its character class.
+// The checks run in a fixed order: punctuation that has a class of its own,
+// then whitespace (isWhitespace), then identifier bytes (isIdentChar).
+// A byte that matches none of them is reported as C_OTHER.
+func getCharacterClass(c byte) Classes {
+	switch {
+	// Structural punctuation, one class per byte.
+	case c == ':':
+		return C_COLON
+	case c == '[':
+		return C_LBRACK
+	case c == ']':
+		return C_RBRACK
+	case c == '{':
+		return C_LBRACE
+	case c == '}':
+		return C_RBRACE
+
+	// The three quantifiers share a single class.
+	case c == '*', c == '+', c == '?':
+		return C_QUANT
+
+	// Broader categories, tried only after the exact matches above.
+	case isWhitespace(c):
+		return C_SPACE
+	case isIdentChar(c):
+		return C_IDENT
+	}
+
+	// Nothing matched.
+	return C_OTHER
+}
+
+// isIdentChar reports whether c may appear in an identifier: an ASCII letter,
+// an ASCII digit, an underscore, or a hyphen (hyphens are comby-specific).
+func isIdentChar(c byte) bool {
+	switch {
+	case c == '_', c == '-':
+		return true
+	}
+	return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9'
+}
diff --git a/fixer_v2/query/internal_test.go b/fixer_v2/query/internal_test.go
@@ -0,0 +1,59 @@
+package query
+
+import (
+	"reflect"
+	"testing"
+)
+
+// TestSpecialPatterns runs hole patterns through the state machine and compares
+// the state entered after each input byte with a hand-written expectation.
+func TestSpecialPatterns(t *testing.T) {
+	cases := []struct {
+		desc     string
+		input    string
+		expected []States // toState of every transition, in input order
+	}{
+		{
+			desc:  "Identifier with zero-or-more",
+			input: ":[[var:identifier]]*",
+			expected: []States{
+				CL, OB, DB,
+				NM, NM, NM,
+				ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
+				CB, QB, QT,
+			},
+		},
+		{
+			desc:  "Multiple holes",
+			input: ":[var] :[next]",
+			expected: []States{
+				CL, OB, NM, NM, NM, CB,
+				WS,
+				CL, OB, NM, NM, NM, NM, CB,
+			},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.desc, func(t *testing.T) {
+			transitions := NewStateMachine(tc.input).recordTransitions()
+			trace := visualizeTransitions(transitions)
+
+			got := make([]States, 0, len(transitions))
+			for _, tr := range transitions {
+				got = append(got, tr.toState)
+			}
+
+			t.Logf("\n Input: %s", tc.input)
+			t.Logf("\nTransitions:\n%s", trace)
+
+			if reflect.DeepEqual(got, tc.expected) {
+				return
+			}
+
+			// Mismatch: report both sequences and repeat the trace next to the failure.
+			t.Errorf("\nGot:  %v\nWant: %v", got, tc.expected)
+			t.Logf("\nTransitions:\n%s", trace)
+		})
+	}
+}