Skip to content

Commit

Permalink
refactor: share some pcre parser bits (#254)
Browse files Browse the repository at this point in the history
In particular, share the basic building block for tracking our position
within the parsed string and looking for specific chars.

Signed-off-by: Rudi Grinberg <[email protected]>
  • Loading branch information
rgrinberg authored Apr 17, 2024
1 parent fc2c8b6 commit b67cbc2
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 69 deletions.
15 changes: 6 additions & 9 deletions lib/emacs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,12 @@ exception Parse_error
exception Not_supported

let parse s =
let i = ref 0 in
let l = String.length s in
let eos () = !i = l in
let test c = not (eos ()) && s.[!i] = c in
let test2 c c' = !i + 1 < l && s.[!i] = c && s.[!i + 1] = c' in
let accept c = let r = test c in if r then incr i; r in
let accept2 c c' = let r = test2 c c' in if r then i := !i + 2; r in
let get () = let r = s.[!i] in incr i; r in

let buf = Parse_buffer.create s in
let accept = Parse_buffer.accept buf in
let accept2 = Parse_buffer.accept2 buf in
let eos () = Parse_buffer.eos buf in
let test2 = Parse_buffer.test2 buf in
let get () = Parse_buffer.get buf in
let rec regexp () = regexp' (branch ())
and regexp' left =
if accept2 '\\' '|' then regexp' (Re.alt [left; branch ()]) else left
Expand Down
66 changes: 66 additions & 0 deletions lib/parse_buffer.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
type t =
{ str : string
; mutable pos : int
}

exception Parse_error

let create str = { str ; pos = 0 }

let unget t = t.pos <- t.pos - 1

let junk t = t.pos <- t.pos + 1

let eos t = t.pos = String.length t.str

let test t c = not (eos t) && t.str.[t.pos] = c

let test2 t c c' =
t.pos + 1 < String.length t.str && t.str.[t.pos] = c && t.str.[t.pos + 1] = c'

let accept t c =
let r = test t c in
if r then t.pos <- t.pos + 1;
r

let accept2 t c c' =
let r = test2 t c c' in
if r then t.pos <- t.pos + 2;
r

let get t =
let r = t.str.[t.pos] in
t.pos <- t.pos + 1;
r

let accept_s t s' =
let len = String.length s' in
try
for j = 0 to len - 1 do
try if s'.[j] <> t.str.[t.pos + j] then raise_notrace Exit
with _ -> raise_notrace Exit
done;
t.pos <- t.pos + len;
true
with Exit -> false

let rec integer' t i =
if eos t then
Some i
else
match get t with
| '0'..'9' as d ->
let i' = 10 * i + (Char.code d - Char.code '0') in
if i' < i then raise Parse_error;
integer' t i'
| _ ->
unget t; Some i

let integer t =
if eos t then
None
else
match get t with
| '0'..'9' as d -> integer' t (Char.code d - Char.code '0')
| _ -> unget t; None

15 changes: 15 additions & 0 deletions lib/parse_buffer.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
type t

exception Parse_error

val create : string -> t
val junk : t -> unit
val unget : t -> unit
val eos : t -> bool
val test : t -> char -> bool
val test2 : t -> char -> char -> bool
val get : t -> char
val accept : t -> char -> bool
val accept2 : t -> char -> char -> bool
val accept_s : t -> string -> bool
val integer : t -> int option
46 changes: 11 additions & 35 deletions lib/perl.ml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

module Re = Core

exception Parse_error
exception Parse_error = Parse_buffer.Parse_error
exception Not_supported

let posix_class_of_string = function
Expand Down Expand Up @@ -50,23 +50,13 @@ let posix_class_strings =
; "graph" ; "xdigit" ]

let parse multiline dollar_endonly dotall ungreedy s =
let i = ref 0 in
let l = String.length s in
let eos () = !i = l in
let test c = not (eos ()) && s.[!i] = c in
let accept c = let r = test c in if r then incr i; r in
let accept_s s' =
let len = String.length s' in
try
for j = 0 to len - 1 do
try if s'.[j] <> s.[!i + j] then raise_notrace Exit
with _ -> raise_notrace Exit
done;
i := !i + len;
true
with Exit -> false in
let get () = let r = s.[!i] in incr i; r in
let unget () = decr i in
let buf = Parse_buffer.create s in
let accept = Parse_buffer.accept buf in
let eos () = Parse_buffer.eos buf in
let test c = Parse_buffer.test buf c in
let unget () = Parse_buffer.unget buf in
let get () = Parse_buffer.get buf in
let accept_s = Parse_buffer.accept_s buf in
let greedy_mod r =
let gr = accept '?' in
let gr = if ungreedy then not gr else gr in
Expand All @@ -85,9 +75,9 @@ let parse multiline dollar_endonly dotall ungreedy s =
if accept '+' then greedy_mod (Re.rep1 r) else
if accept '?' then greedy_mod (Re.opt r) else
if accept '{' then
match integer () with
match Parse_buffer.integer buf with
Some i ->
let j = if accept ',' then integer () else Some i in
let j = if accept ',' then Parse_buffer.integer buf else Some i in
if not (accept '}') then raise Parse_error;
begin match j with
Some j when j < i -> raise Parse_error | _ -> ()
Expand Down Expand Up @@ -195,20 +185,6 @@ let parse multiline dollar_endonly dotall ungreedy s =
| 'a'..'f' as d -> Char.code d - Char.code 'a' + 10
| 'A'..'F' as d -> Char.code d - Char.code 'A' + 10
| _ -> raise Parse_error
and integer () =
if eos () then None else
match get () with
'0'..'9' as d -> integer' (Char.code d - Char.code '0')
| _ -> unget (); None
and integer' i =
if eos () then Some i else
match get () with
'0'..'9' as d ->
let i' = 10 * i + (Char.code d - Char.code '0') in
if i' < i then raise Parse_error;
integer' i'
| _ ->
unget (); Some i
and name () =
if eos () then raise Parse_error else
match get () with
Expand Down Expand Up @@ -291,7 +267,7 @@ let parse multiline dollar_endonly dotall ungreedy s =
`Char c
and comment () =
if eos () then raise Parse_error;
if accept ')' then Re.epsilon else begin incr i; comment () end
if accept ')' then Re.epsilon else begin Parse_buffer.junk buf; comment () end
in
let res = regexp () in
if not (eos ()) then raise Parse_error;
Expand Down
34 changes: 9 additions & 25 deletions lib/posix.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,16 @@ Note that it should be possible to handle "(((ab)c)d)e" efficiently
*)
module Re = Core

exception Parse_error
exception Parse_error = Parse_buffer.Parse_error
exception Not_supported

let parse newline s =
let i = ref 0 in
let l = String.length s in
let eos () = !i = l in
let test c = not (eos ()) && s.[!i] = c in
let accept c = let r = test c in if r then incr i; r in
let get () = let r = s.[!i] in incr i; r in
let unget () = decr i in

let buf = Parse_buffer.create s in
let accept = Parse_buffer.accept buf in
let eos () = Parse_buffer.eos buf in
let test c = Parse_buffer.test buf c in
let unget () = Parse_buffer.unget buf in
let get () = Parse_buffer.get buf in
let rec regexp () = regexp' (branch ())
and regexp' left =
if accept '|' then regexp' (Re.alt [left; branch ()]) else left
Expand All @@ -55,9 +53,9 @@ let parse newline s =
if accept '+' then Re.rep1 (Re.nest r) else
if accept '?' then Re.opt r else
if accept '{' then
match integer () with
match Parse_buffer.integer buf with
Some i ->
let j = if accept ',' then integer () else Some i in
let j = if accept ',' then Parse_buffer.integer buf else Some i in
if not (accept '}') then raise Parse_error;
begin match j with
Some j when j < i -> raise Parse_error | _ -> ()
Expand Down Expand Up @@ -97,20 +95,6 @@ let parse newline s =
'*' | '+' | '?' | '{' | '\\' -> raise Parse_error
| c -> Re.char c
end
and integer () =
if eos () then None else
match get () with
'0'..'9' as d -> integer' (Char.code d - Char.code '0')
| _ -> unget (); None
and integer' i =
if eos () then Some i else
match get () with
'0'..'9' as d ->
let i' = 10 * i + (Char.code d - Char.code '0') in
if i' < i then raise Parse_error;
integer' i'
| _ ->
unget (); Some i
and bracket s =
if s <> [] && accept ']' then s else begin
let c = char () in
Expand Down

0 comments on commit b67cbc2

Please sign in to comment.