Skip to content

Commit

Permalink
feature: add streaming interface
Browse files Browse the repository at this point in the history
This allows us to partially match a string and then resume a match from where
we've ended.
  • Loading branch information
rgrinberg committed Oct 25, 2024
1 parent 73e4c4e commit 6276ee2
Show file tree
Hide file tree
Showing 9 changed files with 533 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Unreleased
* Introduce parsing functions in `Re.{Perl,Pcre,Emacs,Glob}` that return a
result instead of raising. (#542)

* Introduce experimental streaming API `Re.Stream`. (#456)

1.13.1 (30-Sep-2024)
--------------------

Expand Down
176 changes: 176 additions & 0 deletions lib/compile.ml
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,182 @@ let make_match_str re positions ~len ~groups ~partial s ~pos =
else final_boundary_check re positions ~last ~slen s state_info ~groups
;;

module Stream = struct
type nonrec t =
{ state : State.t
; re : re
}

type 'a feed =
| Ok of 'a
| No_match

let create re =
let category = Category.(search_boundary ++ inexistant) in
let state = find_initial_state re category in
{ state; re }
;;

let feed t s ~pos ~len =
(* TODO bound checks? *)
let last = pos + len in
let state = loop_no_mark t.re ~colors:t.re.colors s ~last ~pos t.state t.state in
let info = State.get_info state in
if Idx.is_break info.idx
&&
match Automata.State.status info.desc with
| Failed -> true
| Match _ | Running -> false
then No_match
else Ok { t with state }
;;

let finalize t s ~pos ~len =
(* TODO bound checks? *)
let last = pos + len in
let state = scan_str t.re Positions.empty s t.state ~last ~pos ~groups:false in
let info = State.get_info state in
match
let _idx, res =
let final_cat = Category.(search_boundary ++ inexistant) in
final t.re Positions.empty info final_cat
in
res
with
| Running | Failed -> false
| Match _ -> true
;;

module Group = struct
type nonrec t =
{ t : t
; positions : Positions.t
; slices : Slice.L.t
; abs_pos : int
; first_match_pos : int
}

let no_match_starts_before t = t.first_match_pos

let create t =
{ t
; positions = Positions.make ~groups:true t.re
; slices = []
; abs_pos = 0
; first_match_pos = 0
}
;;

module Match = struct
type t =
{ pmarks : Pmark.Set.t
; slices : Slice.L.t
; marks : Mark_infos.t
; positions : int array
; start_pos : int
}

let test_mark t mark = Pmark.Set.mem mark t.pmarks

let get t i =
Mark_infos.offset t.marks i
|> Option.map (fun (start, stop) ->
let start = t.positions.(start) - t.start_pos in
let stop = t.positions.(stop) - t.start_pos in
Slice.L.get_substring t.slices ~start ~stop)
;;

let make ~start_pos ~pmarks ~slices ~marks ~positions =
let positions = Positions.all positions in
{ pmarks; slices; positions; marks; start_pos }
;;
end

let rec loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st =
if pos < last
then (
let st' = next colors st s pos in
let idx = (State.get_info st').idx in
if Idx.is_idx idx
then (
Positions.set positions (Idx.idx idx) (abs_pos + pos);
loop re ~abs_pos ~colors ~positions s ~pos:(pos + 1) ~last st' st')
else if Idx.is_break idx
then (
Positions.set positions (Idx.break_idx idx) (abs_pos + pos);
st')
else (
(* Unknown *)
validate re positions s ~pos st0;
loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st0))
else st
;;

let feed ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s ~pos ~len =
let state =
(* TODO bound checks? *)
let last = pos + len in
loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
in
let info = State.get_info state in
if Idx.is_break info.idx
&&
match Automata.State.status info.desc with
| Failed -> true
| Match _ | Running -> false
then No_match
else (
let t = { t with state } in
let slices = { Slice.s; pos; len } :: slices in
let first_match_pos = Positions.first positions in
let slices = Slice.L.drop_rev slices (first_match_pos - tt.first_match_pos) in
let abs_pos = abs_pos + len in
Ok { tt with t; slices; abs_pos; first_match_pos })
;;

let finalize
({ t; positions; slices; abs_pos; first_match_pos = _ } as tt)
s
~pos
~len
: Match.t feed
=
(* TODO bound checks? *)
let last = pos + len in
let info =
let state =
loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
in
State.get_info state
in
match
match Automata.State.status info.desc with
| (Match _ | Failed) as s -> s
| Running ->
let idx, res =
let final_cat = Category.(search_boundary ++ inexistant) in
final t.re positions info final_cat
in
(match res with
| Running | Failed -> ()
| Match _ -> Positions.set positions (Automata.Idx.to_int idx) (abs_pos + last));
res
with
| Running | Failed -> No_match
| Match (marks, pmarks) ->
let first_match_position = Positions.first positions in
let slices =
let slices =
let slices = { Slice.s; pos; len } :: slices in
Slice.L.drop_rev slices (first_match_position - tt.first_match_pos)
in
List.rev slices
in
Ok (Match.make ~start_pos:first_match_position ~pmarks ~marks ~slices ~positions)
;;
end
end

let match_str_no_bounds ~groups ~partial re s ~pos ~len =
let positions = Positions.make ~groups re in
match make_match_str re positions ~len ~groups ~partial s ~pos with
Expand Down
29 changes: 29 additions & 0 deletions lib/compile.mli
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
type re

module Stream : sig
type t

type 'a feed =
| Ok of 'a
| No_match

val create : re -> t
val feed : t -> string -> pos:int -> len:int -> t feed
val finalize : t -> string -> pos:int -> len:int -> bool

module Group : sig
type stream := t
type t

module Match : sig
type t

val get : t -> int -> string option
val test_mark : t -> Pmark.t -> bool
end

val create : stream -> t
val feed : t -> string -> pos:int -> len:int -> t feed
val finalize : t -> string -> pos:int -> len:int -> Match.t feed
val no_match_starts_before : t -> int
end
end

type match_info =
| Match of Group.t
| Failed
Expand Down
1 change: 1 addition & 0 deletions lib/core.ml
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,4 @@ include struct
end

module Seq = Search
module Stream = Compile.Stream
40 changes: 39 additions & 1 deletion lib/core.mli
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ val exec_partial_detailed
(** Marks *)
module Mark : sig
(** Mark id *)
type t
type t = Pmark.t

(** Tell if a mark was matched. *)
val test : Group.t -> t -> bool
Expand Down Expand Up @@ -773,3 +773,41 @@ val marked : Group.t -> Mark.t -> bool
(** Same as {!Mark.all}. Deprecated *)
val mark_set : Group.t -> Mark.Set.t
[@@ocaml.deprecated "Use Mark.all"]

module Stream : sig
(** An experimental for matching a regular expression by feeding individual
string chunks.
This module is not covered by semver's stability guarantee. *)

type t

type 'a feed =
| Ok of 'a
| No_match

val create : re -> t
val feed : t -> string -> pos:int -> len:int -> t feed

(** [finalize s ~pos ~len] feed [s] from [pos] to [len] and return whether
the regular expression matched. *)
val finalize : t -> string -> pos:int -> len:int -> bool

module Group : sig
(** Match a string against a regular expression with capture groups *)

type stream := t
type t

module Match : sig
type t

val get : t -> int -> string option
val test_mark : t -> Pmark.t -> bool
end

val create : stream -> t
val feed : t -> string -> pos:int -> len:int -> t feed
val finalize : t -> string -> pos:int -> len:int -> Match.t feed
end
end
70 changes: 70 additions & 0 deletions lib/slice.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
open Import

type t =
{ s : string
; pos : int
; len : int
}

module L = struct
type nonrec t = t list

let get_substring slices ~start ~stop =
if stop = start
then ""
else (
let slices =
let rec drop slices remains =
if remains = 0
then slices
else (
match slices with
| [] -> assert false
| ({ s = _; pos; len } as slice) :: xs ->
let remains' = remains - len in
if remains' >= 0
then drop xs remains'
else (
let pos = pos + remains in
let len = len - remains in
{ slice with pos; len } :: xs))
in
drop slices start
in
let buf = Buffer.create (stop - start) in
let rec take slices remains =
if remains > 0
then (
match slices with
| [] -> assert false
| { s; pos; len } :: xs ->
let remains' = remains - len in
if remains' > 0
then (
Buffer.add_substring buf s pos len;
take xs remains')
else Buffer.add_substring buf s pos remains)
in
take slices (stop - start);
Buffer.contents buf)
;;

let rec drop t remains =
if remains = 0
then t
else (
match t with
| [] -> []
| ({ s = _; pos; len } as slice) :: t ->
if remains >= len
then drop t (remains - len)
else (
let delta = len - remains in
{ slice with pos = pos + delta; len = len - delta } :: t))
;;

let drop_rev t remains =
(* TODO Use a proper functional queue *)
if remains = 0 then t else List.rev (drop (List.rev t) remains)
;;
end
12 changes: 12 additions & 0 deletions lib/slice.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
type t =
{ s : string
; pos : int
; len : int
}

module L : sig
type nonrec t = t list

val get_substring : t -> start:int -> stop:int -> string
val drop_rev : t -> int -> t
end
1 change: 1 addition & 0 deletions lib_test/expect/dune
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(library
(name re_tests)
(modules import test_stream)
(libraries
re_private
;; This is because of the (implicit_transitive_deps false)
Expand Down
Loading

0 comments on commit 6276ee2

Please sign in to comment.