diff --git a/lexc/main.go b/cmd/lexc/main.go
similarity index 100%
rename from lexc/main.go
rename to cmd/lexc/main.go
diff --git a/cmd/lexgen/main.go b/cmd/lexgen/main.go
new file mode 100644
index 0000000..11d5f53
--- /dev/null
+++ b/cmd/lexgen/main.go
@@ -0,0 +1,86 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/timtadh/getopt"
+	"github.com/timtadh/lexmachine/codegen/python"
+	"github.com/timtadh/lexmachine/dfa"
+	"github.com/timtadh/lexmachine/frontend"
+)
+
+var usageMessage = "lexgen -p <pattern> [-p <pattern>]*"
+var extendedMessage = `
+lexgen compiles regular expressions to a Python scanner program
+
+Options
+    -h, --help                 print this message
+    -p, --pattern=<pattern>    a regex pattern
+
+Specs
+
+    <pattern>                  a regex pattern
+`
+
+func usage(code int) {
+	fmt.Fprintln(os.Stderr, usageMessage)
+	if code == 0 {
+		fmt.Fprintln(os.Stderr, extendedMessage)
+		code = 1
+	} else {
+		fmt.Fprintln(os.Stderr, "Try -h or --help for help")
+	}
+	os.Exit(code)
+}
+
+func main() {
+
+	short := "hp:"
+	long := []string{
+		"help",
+		"pattern=",
+	}
+
+	_, optargs, err := getopt.GetOpt(os.Args[1:], short, long)
+	if err != nil {
+		log.Print(err)
+		usage(1)
+	}
+
+	patterns := make([]string, 0, 10)
+	for _, oa := range optargs {
+		switch oa.Opt() {
+		case "-h", "--help":
+			usage(0)
+		case "-p", "--pattern":
+			patterns = append(patterns, oa.Arg())
+		}
+	}
+
+	if len(patterns) <= 0 {
+		log.Print("Must supply some regular expressions!")
+		usage(1)
+	}
+
+	asts := make([]frontend.AST, 0, len(patterns))
+	for _, p := range patterns {
+		ast, err := frontend.Parse([]byte(p))
+		if err != nil {
+			log.Fatal(err)
+		}
+		asts = append(asts, ast)
+	}
+
+	// Combine all of the patterns into a single alternation AST so that
+	// one DFA can match any of them.
+	lexast := asts[len(asts)-1]
+	for i := len(asts) - 2; i >= 0; i-- {
+		lexast = frontend.NewAltMatch(asts[i], lexast)
+	}
+
+	// Compile the combined AST to a DFA and emit it as a Python program.
+	pydfa := python.Generate(dfa.Generate(lexast))
+	fmt.Println(pydfa)
+}
diff --git a/codegen/python/pygen.go b/codegen/python/pygen.go
new file mode 100644
index 0000000..d58f388
--- /dev/null
+++ b/codegen/python/pygen.go
@@ -0,0 +1,128 @@
+package python
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/timtadh/lexmachine/dfa"
+)
+
+// header is the fixed prelude of every generated scanner; the state
+// methods generated below are appended so they become _Scanner methods.
+var header = `
+
+def tokenize(input):
+    return _Scanner(input).tokenize()
+
+class Match(object):
+
+    def __init__(self, match_id, lexeme):
+        self.match_id = match_id
+        self.lexeme = lexeme
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __str__(self):
+        return "Match({}, {})".format(self.match_id, repr(self.lexeme))
+
+class _Scanner(object):
+
+    def __init__(self, input):
+        self.input = input
+        self.idx = 0
+        self.buf = list()
+        self.tokens = list()
+
+    def tokenize(self):
+        state = self.start()
+        while state != None:
+            state = state()
+        if self.idx != len(self.input):
+            self.eosError()
+        return self.tokens
+
+    def mvto(self, next_state):
+        if self.idx >= len(self.input):
+            raise Exception("internal DFA error, index out of bounds")
+        self.buf.append(self.input[self.idx])
+        self.idx += 1
+        return next_state
+
+    def match(self, match_id):
+        self.tokens.append(Match(match_id, ''.join(self.buf)))
+        self.buf = list()
+        if self.idx < len(self.input):
+            return self.start()
+        return None
+
+    def eosError(self, state=None):
+        raise Exception("UnconsumedInput, {}".format(repr(self.input[self.idx-len(self.buf):])))
+
+    def error(self, state, expected):
+        raise Exception("UnexpectedInput, {}. expected one of: {}".format(
+            repr(self.input[self.idx]),
+            [chr(x) for x in expected]))
+
+`
+
+// Generate renders the DFA as a self-contained Python scanner module.
+func Generate(dfa *dfa.DFA) string {
+	stateFuncs := make([]string, 0, len(dfa.Trans))
+	stateFuncs = append(stateFuncs, genStart(dfa))
+	for state := range dfa.Trans {
+		stateFuncs = append(stateFuncs, genState(dfa, state))
+	}
+	return header + strings.Join(stateFuncs, "\n\n")
+}
+
+func genStart(dfa *dfa.DFA) string {
+	lines := make([]string, 0, 3)
+	lines = append(lines, fmt.Sprintf("    def start(self):"))
+	lines = append(lines, fmt.Sprintf("        return self.state_%d", dfa.Start))
+	return strings.Join(lines, "\n")
+}
+
+func genState(dfa *dfa.DFA, state int) string {
+	trans := dfa.Trans[state]
+	matchID, accepting := dfa.Accepting[state]
+	lines := make([]string, 0, len(trans))
+	lines = append(lines, fmt.Sprintf("    def state_%v(self):", state))
+	if dfa.Error == state {
+		lines = append(lines, fmt.Sprintf("        self.error(%d, [])", state))
+		return strings.Join(lines, "\n")
+	}
+	if len(trans) > 0 && accepting {
+		lines = append(lines, fmt.Sprintf("        if self.idx >= len(self.input):"))
+		lines = append(lines, fmt.Sprintf("            return self.match(%v)", matchID))
+	} else if len(trans) > 0 {
+		lines = append(lines, fmt.Sprintf("        if self.idx >= len(self.input):"))
+		lines = append(lines, fmt.Sprintf("            self.eosError(%v)", state))
+		lines = append(lines, fmt.Sprintf("            return"))
+	}
+	first := true
+	allowed := make([]string, 0, len(trans))
+	// Emit one if/elif branch for each byte value with a non-error transition.
+	for ord := 0; ord < len(trans); ord++ {
+		if trans[ord] == dfa.Error {
+			continue
+		}
+		allowed = append(allowed, fmt.Sprint(ord))
+		if first {
+			lines = append(lines, fmt.Sprintf("        if ord(self.input[self.idx]) == %d:", ord))
+			first = false
+		} else {
+			lines = append(lines, fmt.Sprintf("        elif ord(self.input[self.idx]) == %d:", ord))
+		}
+		lines = append(lines, fmt.Sprintf("            return self.mvto(self.state_%d)", trans[ord]))
+	}
+	if len(allowed) == 0 && !accepting && dfa.Error != state {
+		panic("bad dfa")
+	}
+	if accepting {
+		lines = append(lines, fmt.Sprintf("        return self.match(%d)", matchID))
+	} else {
+		lines = append(lines, fmt.Sprintf("        self.error(%d, [%v])", state, strings.Join(allowed, ", ")))
+	}
+	return strings.Join(lines, "\n")
+}
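
Usage sketch (the patterns, the scanner.py filename, and the input text are
illustrative only, and the commands assume they are run from the repository root):

    $ go run cmd/lexgen/main.go -p '[a-z]+' -p '[0-9]+' > scanner.py
    $ python -c 'import scanner; print(scanner.tokenize("abc123"))'

lexgen writes the generated program to stdout, and the generated module exposes
tokenize(input), which returns a list of Match objects and raises an Exception on
unexpected or unconsumed input, so the second command should print a Match for
"abc" followed by a Match for "123".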