diff --git a/README.md b/README.md index 4a2b2494..7d4d8ee1 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ with a length different from one. Note: - - The OCaml source is assumed to be encoded in Latin1 (for string + - The OCaml source is assumed to be encoded in utf8 (for string and character literals). diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 601ac945..356f7973 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -285,29 +285,52 @@ let codepoint i = failwith (Printf.sprintf "Invalid Unicode code point: %i" i); i -let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c)) - -let regexp_for_string s = - let rec aux n = - if n = String.length s then Sedlex.eps - else Sedlex.seq (regexp_for_char s.[n]) (aux (succ n)) +let fold_bytes ~f acc s = + let rec loop acc n = + if n = String.length s then acc + else ( + let acc = f acc s.[n] in + loop acc (succ n)) in - aux 0 + loop acc 0 + +let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c)) +let regexp_for_uchar c = Sedlex.chars (Cset.singleton (Uchar.to_int c)) let err loc s = raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s)) +let fold_well_formed_utf8 ~loc ~f acc s = + Utf8.fold + ~f:(fun acc _ uchar -> + match uchar with + | `Malformed _ -> err loc "Malformed utf-8 string" + | `Uchar uchar -> f acc uchar) + acc s + +let regexp_for_string ~loc ~utf8 s = + let l_rev = + if utf8 then + fold_well_formed_utf8 ~loc ~f:(fun acc uchar -> uchar :: acc) [] s + else fold_bytes ~f:(fun acc c -> Uchar.of_char c :: acc) [] s + in + let rec aux = function + | [] -> Sedlex.eps + | x :: xs -> Sedlex.seq (regexp_for_uchar x) (aux xs) + in + aux (List.rev l_rev) + let rec repeat r = function | 0, 0 -> Sedlex.eps | 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1))) | n, m -> Sedlex.seq r (repeat r (n - 1, m - 1)) let regexp_of_pattern env = - let rec char_pair_op func name p tuple = + let rec char_pair_op func name ~utf8 p tuple = (* Construct something like Sub(a,b) *) match tuple with | Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin - match func (aux p0) (aux p1) with + match func (aux ~utf8 p0) (aux ~utf8 p1) with | Some r -> r | None -> err p.ppat_loc @@ "the " ^ name @@ -317,16 +340,20 @@ let regexp_of_pattern env = | _ -> err p.ppat_loc @@ "the " ^ name ^ " operator requires two arguments, like " ^ name ^ "(a,b)" - and aux p = + and aux ~utf8 p = (* interpret one pattern node *) match p.ppat_desc with - | Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2) + | Ppat_or (p1, p2) -> Sedlex.alt (aux ~utf8 p1) (aux ~utf8 p2) | Ppat_tuple (p :: pl) -> - List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl + List.fold_left + (fun r p -> Sedlex.seq r (aux ~utf8 p)) + (aux ~utf8 p) pl | Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) -> - Sedlex.rep (aux p) + Sedlex.rep (aux ~utf8 p) | Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) -> - Sedlex.plus (aux p) + Sedlex.plus (aux ~utf8 p) + | Ppat_construct ({ txt = Lident "Utf8" }, Some (_, p)) -> + aux ~utf8:true p | Ppat_construct ( { txt = Lident "Rep" }, Some @@ -346,7 +373,7 @@ let regexp_of_pattern env = | Pconst_integer (i1, _), Pconst_integer (i2, _) -> let i1 = int_of_string i1 in let i2 = int_of_string i2 in - if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2) + if 0 <= i1 && i1 <= i2 then repeat (aux ~utf8 p0) (i1, i2) else err p.ppat_loc "Invalid range for Rep operator" | _ -> err p.ppat_loc "Rep must take an integer constant or interval" @@ -354,11 +381,11 @@ let regexp_of_pattern env = | Ppat_construct ({ txt = Lident "Rep" }, _) -> err p.ppat_loc "the Rep operator takes 2 arguments" | Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) -> - Sedlex.alt Sedlex.eps (aux p) + Sedlex.alt Sedlex.eps (aux ~utf8 p) | Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin match arg with | Some (_, p0) -> begin - match Sedlex.compl (aux p0) with + match Sedlex.compl (aux ~utf8 p0) with | Some r -> r | None -> err p.ppat_loc @@ -368,26 +395,36 @@ let regexp_of_pattern env = | _ -> err p.ppat_loc "the Compl operator requires an argument" end | Ppat_construct ({ txt = Lident "Sub" }, arg) -> - char_pair_op Sedlex.subtract "Sub" p + char_pair_op ~utf8 Sedlex.subtract "Sub" p (Option.map (fun (_, arg) -> arg) arg) | Ppat_construct ({ txt = Lident "Intersect" }, arg) -> - char_pair_op Sedlex.intersection "Intersect" p + char_pair_op ~utf8 Sedlex.intersection "Intersect" p (Option.map (fun (_, arg) -> arg) arg) - | Ppat_construct ({ txt = Lident "Chars" }, arg) -> ( + | Ppat_construct ({ txt = Lident "Chars" }, arg) -> let const = match arg with | Some (_, { ppat_desc = Ppat_constant const }) -> Some const | _ -> None in - match const with - | Some (Pconst_string (s, _, _)) -> - let c = ref Cset.empty in - for i = 0 to String.length s - 1 do - c := Cset.union !c (Cset.singleton (Char.code s.[i])) - done; - Sedlex.chars !c - | _ -> - err p.ppat_loc "the Chars operator requires a string argument") + begin + match const with + | Some (Pconst_string (s, _, _)) -> + let chars = + if utf8 then + fold_well_formed_utf8 ~loc:p.ppat_loc + ~f:(fun acc uchar -> + Cset.union acc (Cset.singleton (Uchar.to_int uchar))) + Cset.empty s + else + fold_bytes + ~f:(fun acc c -> + Cset.union acc (Cset.singleton (Char.code c))) + Cset.empty s + in + Sedlex.chars chars + | _ -> + err p.ppat_loc "the Chars operator requires a string argument" + end | Ppat_interval (i_start, i_end) -> begin match (i_start, i_end) with | Pconst_char c1, Pconst_char c2 -> @@ -401,7 +438,8 @@ let regexp_of_pattern env = end | Ppat_constant const -> begin match const with - | Pconst_string (s, _, _) -> regexp_for_string s + | Pconst_string (s, _, _) -> + regexp_for_string ~loc:p.ppat_loc ~utf8 s | Pconst_char c -> regexp_for_char c | Pconst_integer (i, _) -> Sedlex.chars (Cset.singleton (codepoint (int_of_string i))) @@ -414,7 +452,7 @@ let regexp_of_pattern env = end | _ -> err p.ppat_loc "this pattern is not a valid regexp" in - aux + aux ~utf8:false let previous = ref [] let regexps = ref [] diff --git a/src/syntax/utf8.ml b/src/syntax/utf8.ml new file mode 100644 index 00000000..c999256a --- /dev/null +++ b/src/syntax/utf8.ml @@ -0,0 +1,73 @@ +let unsafe_byte s j = Char.code (String.unsafe_get s j) +let malformed s j l = `Malformed (String.sub s j l) + +let width = function + | '\000' .. '\127' -> 1 + | '\192' .. '\223' -> 2 + | '\224' .. '\239' -> 3 + | '\240' .. '\247' -> 4 + | _ -> 0 + +let r_utf_8 s j l = + (* assert (0 <= j && 0 <= l && j + l <= String.length s); *) + let uchar c = `Uchar (Uchar.unsafe_of_int c) in + match l with + | 1 -> uchar (unsafe_byte s j) + | 2 -> + let b0 = unsafe_byte s j in + let b1 = unsafe_byte s (j + 1) in + if b1 lsr 6 != 0b10 then malformed s j l + else uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)) + | 3 -> + let b0 = unsafe_byte s j in + let b1 = unsafe_byte s (j + 1) in + let b2 = unsafe_byte s (j + 2) in + let c = + ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) + in + if b2 lsr 6 != 0b10 then malformed s j l + else begin + match b0 with + | 0xE0 -> + if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c + | 0xED -> + if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c + | _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c + end + | 4 -> + let b0 = unsafe_byte s j in + let b1 = unsafe_byte s (j + 1) in + let b2 = unsafe_byte s (j + 2) in + let b3 = unsafe_byte s (j + 3) in + let c = + ((b0 land 0x07) lsl 18) + lor ((b1 land 0x3F) lsl 12) + lor ((b2 land 0x3F) lsl 6) + lor (b3 land 0x3F) + in + if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l + else begin + match b0 with + | 0xF0 -> + if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c + | 0xF4 -> + if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c + | _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c + end + | _ -> assert false + +let fold ~f acc s = + let rec loop acc f s i last = + if i > last then acc + else ( + let need = width (String.unsafe_get s i) in + if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last + else ( + let rem = last - i + 1 in + if rem < need then f acc i (malformed s i rem) + else loop (f acc i (r_utf_8 s i need)) f s (i + need) last)) + in + let pos = 0 in + let len = String.length s in + let last = pos + len - 1 in + loop acc f s pos last diff --git a/src/syntax/utf8.mli b/src/syntax/utf8.mli new file mode 100644 index 00000000..c5cf2ddc --- /dev/null +++ b/src/syntax/utf8.mli @@ -0,0 +1,5 @@ +val fold : + f:('a -> int -> [> `Malformed of string | `Uchar of Uchar.t ] -> 'a) -> + 'a -> + string -> + 'a diff --git a/test/utf8.ml b/test/utf8.ml new file mode 100644 index 00000000..31d850fc --- /dev/null +++ b/test/utf8.ml @@ -0,0 +1,19 @@ +open Printf + +let next_tok buf = + let open Sedlexing.Utf8 in + match%sedlex buf with + | "a", Utf8 (Chars "+-×÷") -> sprintf "with Chars: %s" (lexeme buf) + | "b", Utf8 ("+" | "-" | "×" | "÷") -> + sprintf "with or_pattern: %s" (lexeme buf) + | _ -> failwith (sprintf "Unexpected character: %s" (lexeme buf)) + +let%expect_test _ = + Sedlexing.Utf8.from_string "a+" |> next_tok |> print_string; + [%expect {| with Chars: a+ |}]; + Sedlexing.Utf8.from_string "a÷" |> next_tok |> print_string; + [%expect {| with Chars: a÷ |}]; + Sedlexing.Utf8.from_string "b+" |> next_tok |> print_string; + [%expect {| with or_pattern: b+ |}]; + Sedlexing.Utf8.from_string "b÷" |> next_tok |> print_string; + [%expect {| with or_pattern: b÷ |}]