From d21481ff64092c63e0f32134c0b3bd5fcc114885 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ulysse=20G=C3=A9rard?= <thevoodoos@gmail.com>
Date: Thu, 12 Dec 2024 18:00:11 +0100
Subject: [PATCH] Lexer: merge utf8 support and reduce distance with upstream

---
 src/ocaml/preprocess/lexer_raw.mli |   6 +
 src/ocaml/preprocess/lexer_raw.mll | 405 +++++++++++++++++++----------
 2 files changed, 275 insertions(+), 136 deletions(-)

diff --git a/src/ocaml/preprocess/lexer_raw.mli b/src/ocaml/preprocess/lexer_raw.mli
index 4f66ff0fe..3942ee7e9 100644
--- a/src/ocaml/preprocess/lexer_raw.mli
+++ b/src/ocaml/preprocess/lexer_raw.mli
@@ -22,7 +22,13 @@ type error =
   | Unterminated_string_in_comment of Location.t * Location.t
   | Empty_character_literal
   | Keyword_as_label of string
+  | Capitalized_label of string
   | Invalid_literal of string
+  | Invalid_directive of string * string option
+  | Invalid_encoding of string
+  | Invalid_char_in_ident of Uchar.t
+  | Non_lowercase_delimiter of string
+  | Capitalized_raw_identifier of string
   | Unknown_keyword of string
 exception Error of error * Location.t
 
diff --git a/src/ocaml/preprocess/lexer_raw.mll b/src/ocaml/preprocess/lexer_raw.mll
index 2c216ddc8..d5f7feb43 100644
--- a/src/ocaml/preprocess/lexer_raw.mll
+++ b/src/ocaml/preprocess/lexer_raw.mll
@@ -29,7 +29,13 @@ type error =
   | Unterminated_string_in_comment of Location.t * Location.t
   | Empty_character_literal
   | Keyword_as_label of string
+  | Capitalized_label of string
   | Invalid_literal of string
+  | Invalid_directive of string * string option
+  | Invalid_encoding of string
+  | Invalid_char_in_ident of Uchar.t
+  | Non_lowercase_delimiter of string
+  | Capitalized_raw_identifier of string
   | Unknown_keyword of string
 
 exception Error of error * Location.t
@@ -52,6 +58,9 @@ let rec (>>=) (m : 'a result) (f : 'a -> 'b result) : 'b result =
     Refill (fun () -> u () >>= f)
   | Fail _ as e -> e
 
+let (let*) = (>>=)
+let (let+) = fun m f -> (>>=) m (fun x -> return (f x))
+
 type preprocessor = (Lexing.lexbuf -> Parser_raw.token) -> Lexing.lexbuf -> Parser_raw.token
 
 type state = {
@@ -180,8 +189,11 @@ let list_keywords =
     Hashtbl.fold add_kw keywords init
 
 let store_string_char buf c = Buffer.add_char buf c
+let store_string_utf_8_uchar buf u = Buffer.add_utf_8_uchar buf u
+let store_string buf s = Buffer.add_string buf s
 let store_substring buf s ~pos ~len = Buffer.add_substring buf s pos len
 
+let store_lexeme buf lexbuf = store_string buf (Lexing.lexeme lexbuf)
 let store_normalized_newline buf newline =
   (* #12502: we normalize "\r\n" to "\n" at lexing time,
      to avoid behavior difference due to OS-specific
@@ -210,13 +222,18 @@ let store_normalized_newline buf newline =
 
 (* To store the position of the beginning of a string and comment *)
 let in_comment state = state.comment_start_loc <> []
+let print_warnings = ref true
 
 (* Escaped chars are interpreted in strings unless they are in comments. *)
-let store_escaped_uchar state lexbuf u =
+let store_escaped_char state lexbuf c =
   if in_comment state
-  then Buffer.add_string state.buffer (Lexing.lexeme lexbuf)
-  else Buffer.add_utf_8_uchar state.buffer u
+  then store_lexeme state.buffer lexbuf
+  else store_string_char state.buffer c
 
+let store_escaped_uchar state lexbuf u =
+  if in_comment state
+  then store_lexeme state.buffer lexbuf
+  else store_string_utf_8_uchar state.buffer u
 
 let compute_quoted_string_idloc {Location.loc_start = orig_loc; _ } shift id =
   let id_start_pos = orig_loc.Lexing.pos_cnum + shift in
@@ -243,6 +260,16 @@ let wrap_string_lexer f state lexbuf =
   state.string_start_loc <- Location.none;
   return (Buffer.contents state.buffer, loc)
 
+let wrap_comment_lexer state comment lexbuf =
+  let start_loc = Location.curr lexbuf  in
+  state.comment_start_loc <- [start_loc];
+  Buffer.reset state.buffer;
+  let+ end_loc = comment state lexbuf in
+  let s = Buffer.contents state.buffer in
+  Buffer.reset state.buffer;
+  s,
+  { start_loc with Location.loc_end = end_loc.Location.loc_end }
+
 (* to translate escape sequences *)
 
 let digit_value c =
@@ -316,6 +343,39 @@ let uchar_for_uchar_escape lexbuf =
       illegal_escape lexbuf
         (Printf.sprintf "%X is not a Unicode scalar value" cp)
 
+let validate_encoding lexbuf raw_name =
+  match Utf8_lexeme.normalize raw_name with
+  | Error _ -> fail lexbuf (Invalid_encoding raw_name)
+  | Ok name -> return name
+
+let ident_for_extended lexbuf raw_name =
+  let* name = validate_encoding lexbuf raw_name in
+  match Utf8_lexeme.validate_identifier name with
+  | Utf8_lexeme.Valid -> return name
+  | Utf8_lexeme.Invalid_character u -> fail lexbuf (Invalid_char_in_ident u)
+  | Utf8_lexeme.Invalid_beginning _ ->
+  assert false (* excluded by the regexps *)
+
+let validate_delim lexbuf raw_name =
+  let* name = validate_encoding lexbuf raw_name in
+  if Utf8_lexeme.is_lowercase name then return name
+  else fail lexbuf (Non_lowercase_delimiter name)
+
+let validate_ext lexbuf name =
+  let* name = validate_encoding lexbuf name in
+  match Utf8_lexeme.validate_identifier ~with_dot:true name with
+  | Utf8_lexeme.Valid -> return name
+  | Utf8_lexeme.Invalid_character u -> fail lexbuf (Invalid_char_in_ident u)
+  | Utf8_lexeme.Invalid_beginning _ ->
+  assert false (* excluded by the regexps *)
+
+let lax_delim raw_name =
+  match Utf8_lexeme.normalize raw_name with
+  | Error _ -> None
+  | Ok name ->
+     if Utf8_lexeme.is_lowercase name then Some name
+     else None
+
 let keyword_or state s default =
   try Hashtbl.find state.keywords s
   with Not_found ->
@@ -337,9 +397,11 @@ let find_keyword state lexbuf ~name ~default =
   try return @@ Hashtbl.find state.keywords name
   with Not_found -> find_keyword lexbuf name default
 
-let check_label_name lexbuf name =
-  if is_keyword name
-  then fail lexbuf (Keyword_as_label name)
+let check_label_name ?(raw_escape=false) lexbuf name =
+  if Utf8_lexeme.is_capitalized name then
+    fail lexbuf (Capitalized_label name)
+  else if not raw_escape && is_keyword name then
+    fail lexbuf (Keyword_as_label name)
   else return name
 
 (* Update the current location with file name and line number. *)
@@ -357,11 +419,9 @@ let update_loc lexbuf _file line absolute chars =
     pos_bol = pos.pos_cnum - chars;
   }
 
-(* Warn about Latin-1 characters used in idents *)
 
-let warn_latin1 lexbuf =
-  Location.deprecated (Location.curr lexbuf)
-    "ISO-Latin1 characters in identifiers"
+(* TODO Merlin should we support this ?*)
+let handle_docstrings = ref false
 
 (* Error report *)
 
@@ -399,8 +459,31 @@ let prepare_error loc = function
   | Keyword_as_label kwd ->
       Location.errorf ~loc
         "%a is a keyword, it cannot be used as label name" Style.inline_code kwd
+  | Capitalized_label lbl ->
+      Location.errorf ~loc
+        "%a cannot be used as label name, \
+         it must start with a lowercase letter" Style.inline_code lbl
   | Invalid_literal s ->
       Location.errorf ~loc "Invalid literal %s" s
+  | Invalid_directive (dir, explanation) ->
+      Location.errorf ~loc "Invalid lexer directive %S%t" dir
+        (fun ppf -> match explanation with
+           | None -> ()
+           | Some expl -> fprintf ppf ": %s" expl)
+  | Invalid_encoding s ->
+    Location.errorf ~loc "Invalid encoding of identifier %s." s
+  | Invalid_char_in_ident u ->
+      Location.errorf ~loc "Invalid character U+%X in identifier"
+         (Uchar.to_int u)
+  | Capitalized_raw_identifier lbl ->
+      Location.errorf ~loc
+        "%a cannot be used as a raw identifier, \
+         it must start with a lowercase letter" Style.inline_code lbl
+  | Non_lowercase_delimiter name ->
+      Location.errorf ~loc
+        "%a cannot be used as a quoted string delimiter,@ \
+         it must contain only lowercase letters."
+         Style.inline_code name
   | Unknown_keyword name ->
       Location.errorf ~loc
       "%a has been defined as an additional keyword.@ \
@@ -423,14 +506,26 @@ let newline = ('\013'* '\010')
 let blank = [' ' '\009' '\012']
 let lowercase = ['a'-'z' '_']
 let uppercase = ['A'-'Z']
-let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9' '\128'-'\255']
+let identstart = lowercase | uppercase
+let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
+let utf8 = ['\192'-'\255'] ['\128'-'\191']*
+let identstart_ext = identstart | utf8
+let identchar_ext = identchar | utf8
+let delim_ext = (lowercase | uppercase | utf8)*
+(* ascii uppercase letters in quoted string delimiters ({delim||delim}) are
+   rejected by the delimiter validation function, we accept them temporarily to
+   have the same error message for ascii and non-ascii uppercase letters *)
+
+(* TODO REMOVE *)
 let lowercase_latin1 = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
 let uppercase_latin1 = ['A'-'Z' '\192'-'\214' '\216'-'\222']
 let identchar_latin1 = identchar
   (*['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']*)
+(* END TODO REMOVE *)
+
 let symbolchar =
   ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
-let symbolcharnopercent =
+let symbolcharnopercent = (* TODO ???? *)
   ['!' '$' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
 let dotsymbolchar =
   ['!' '$' '%' '&' '*' '+' '-' '/' ':' '=' '>' '?' '@' '^' '|']
@@ -440,6 +535,7 @@ let kwdopchar =
   ['$' '&' '*' '+' '-' '/' '<' '=' '>' '@' '^' '|']
 
 let ident = (lowercase | uppercase) identchar*
+let ident_ext = identstart_ext  identchar_ext*
 let extattrident = ident ('.' ident)*
 
 let decimal_literal =
@@ -500,32 +596,36 @@ rule token state = parse
       { fail lexbuf
           (Reserved_sequence (".~", Some "is reserved for use in MetaOCaml")) }
       *)
-  | "~" raw_ident_escape (lowercase identchar * as name) ':'
-      { return (LABEL name) }
-  | "~" (lowercase identchar * as name) ':'
+  | "~" (identstart identchar * as name) ':'
       { lABEL (check_label_name lexbuf name) }
-  | "~" (lowercase_latin1 identchar_latin1 * as name) ':'
-      { warn_latin1 lexbuf;
-        return (LABEL name) }
+  | "~" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
+      { ident_for_extended lexbuf raw_name
+        >>= check_label_name ~raw_escape:(escape<>"") lexbuf
+        |> lABEL }
   | "?"
       { return QUESTION }
-  | "?" raw_ident_escape (lowercase identchar * as name) ':'
-      { return (OPTLABEL name) }
   | "?" (lowercase identchar * as name) ':'
       { oPTLABEL (check_label_name lexbuf name) }
-  | "?" (lowercase_latin1 identchar_latin1 * as name) ':'
-      { warn_latin1 lexbuf; return (OPTLABEL name) }
-  | raw_ident_escape (lowercase identchar * as name)
-      { return (LIDENT name) }
+  | "?" (raw_ident_escape? as escape) (ident_ext as raw_name) ':'
+      { ident_for_extended lexbuf raw_name
+        >>= check_label_name ~raw_escape:(escape<>"") lexbuf
+        |> oPTLABEL }
+  (* | raw_ident_escape (lowercase identchar * as name)
+      { return (LIDENT name) } *)
   | lowercase identchar * as name
     { (find_keyword state lexbuf ~name ~default:(LIDENT name)) }
-  | lowercase_latin1 identchar_latin1 * as name
-      { warn_latin1 lexbuf; return (LIDENT name) }
   | uppercase identchar * as name
     { (* Capitalized keywords for OUnit *)
       (find_keyword state lexbuf ~name ~default:(UIDENT name))}
-  | uppercase_latin1 identchar_latin1 * as name
-    { warn_latin1 lexbuf; return (UIDENT name) }
+  | (raw_ident_escape? as escape) (ident_ext as raw_name)
+    { let* name = ident_for_extended lexbuf raw_name in
+      if Utf8_lexeme.is_capitalized name then begin
+        if escape="" then return (UIDENT name)
+        else return (UIDENT name)
+          (* we don't have capitalized keywords, and thus no needs for
+             capitalized raw identifiers. *)
+          (*fail lexbuf (Capitalized_raw_identifier name)*)
+      end else return (LIDENT name) }
   | int_literal as lit { return (INT (lit, None)) }
   | (int_literal as lit) (literal_modifier as modif)
     { return (INT (lit, Some modif)) }
@@ -538,37 +638,36 @@ rule token state = parse
   | "\""
       { wrap_string_lexer string state lexbuf >>= fun (str, loc) ->
         return (STRING (str, loc, None)) }
-  | "\'\'"
-      { wrap_string_lexer string state lexbuf >>= fun (str, loc) ->
-        return (STRING (str, loc, None)) }
-  | "{" (lowercase* as delim) "|"
-      { wrap_string_lexer (quoted_string delim) state lexbuf
-        >>= fun (str, loc) ->
-        return (STRING (str, loc, Some delim)) }
-  | "{%" (extattrident as id) "|"
+  | "{" (delim_ext as raw_name) '|'
+      { let* delim = validate_delim lexbuf raw_name in
+        let+ s, loc = wrap_string_lexer (quoted_string delim) state lexbuf in
+        STRING (s, loc, Some delim) }
+  | "{%" (extattrident as raw_id) "|"
       { let orig_loc = Location.curr lexbuf in
-        wrap_string_lexer (quoted_string "") state lexbuf
-        >>= fun (str, loc) ->
+        let* id = validate_ext lexbuf raw_id in
+        let+ s, loc =wrap_string_lexer (quoted_string "") state lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 2 id in
-        return (QUOTED_STRING_EXPR (id, idloc, str, loc, Some "")) }
-  | "{%" (extattrident as id) blank+ (lowercase* as delim) "|"
+        QUOTED_STRING_EXPR (id, idloc, s, loc, Some "") }
+  | "{%" (extattrident as raw_id) blank+ (delim_ext as raw_delim) "|"
       { let orig_loc = Location.curr lexbuf in
-        wrap_string_lexer (quoted_string delim) state lexbuf
-        >>= fun (str, loc) ->
+        let* id = validate_ext lexbuf raw_id in
+        let* delim = validate_delim lexbuf raw_delim in
+        let+ s, loc = wrap_string_lexer (quoted_string delim) state lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 2 id in
-        return (QUOTED_STRING_EXPR (id, idloc, str, loc, Some delim)) }
-  | "{%%" (extattrident as id) "|"
+        QUOTED_STRING_EXPR (id, idloc, s, loc, Some delim) }
+  | "{%%" (extattrident as raw_id) "|"
       { let orig_loc = Location.curr lexbuf in
-        wrap_string_lexer (quoted_string "") state lexbuf
-        >>= fun (str, loc) ->
+        let* id = validate_ext lexbuf raw_id in
+        let+ s, loc = wrap_string_lexer (quoted_string "") state lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 3 id in
-        return (QUOTED_STRING_ITEM (id, idloc, str, loc, Some "")) }
-  | "{%%" (extattrident as id) blank+ (lowercase* as delim) "|"
+        QUOTED_STRING_ITEM (id, idloc, s, loc, Some "") }
+  | "{%%" (extattrident as raw_id) blank+ (delim_ext as raw_delim) "|"
       { let orig_loc = Location.curr lexbuf in
-        wrap_string_lexer (quoted_string delim) state lexbuf
-        >>= fun (str, loc) ->
+        let* id = validate_ext lexbuf raw_id in
+        let* delim = validate_delim lexbuf raw_delim in
+        let+ s, loc = wrap_string_lexer (quoted_string delim) state lexbuf in
         let idloc = compute_quoted_string_idloc orig_loc 3 id in
-        return (QUOTED_STRING_ITEM (id, idloc, str, loc, Some delim)) }
+        QUOTED_STRING_ITEM (id, idloc, s, loc, Some delim) }
   | "\'" newline "\'"
     { update_loc lexbuf None 1 false 1;
       (* newline is ('\013'* '\010') *)
@@ -577,34 +676,45 @@ rule token state = parse
     { return (CHAR c) }
   | "\'\\" (['\\' '\'' '\"' 'n' 't' 'b' 'r' ' '] as c) "\'"
     { return (CHAR (char_for_backslash c)) }
-  | "\'\\" 'o' ['0'-'3'] ['0'-'7'] ['0'-'7'] "\'"
-    { char_for_octal_code state lexbuf 3 >>= fun c -> return (CHAR c) }
   | "\'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "\'"
     { char_for_decimal_code state lexbuf 2 >>= fun c -> return (CHAR c) }
+  | "\'\\" 'o' ['0'-'3'] ['0'-'7'] ['0'-'7'] "\'"
+    { char_for_octal_code state lexbuf 3 >>= fun c -> return (CHAR c) }
   | "\'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "\'"
     { return (CHAR (char_for_hexadecimal_code lexbuf 3)) }
   | "\'" ("\\" [^ '#'] as esc)
       { fail lexbuf (Illegal_escape (esc, None)) }
   | "(*"
-      { let start_loc = Location.curr lexbuf in
-        state.comment_start_loc <- [start_loc];
-        Buffer.reset state.buffer;
-        comment state lexbuf >>= fun end_loc ->
-        let s = Buffer.contents state.buffer in
-        Buffer.reset state.buffer;
-        return (COMMENT (s, { start_loc with
-                              Location.loc_end = end_loc.Location.loc_end }))
+      { let+ s, loc = wrap_comment_lexer state comment lexbuf in
+        COMMENT (s, loc) }
+  | "(**"
+      { let+ s, loc = wrap_comment_lexer state comment lexbuf in
+        if !handle_docstrings then
+          DOCSTRING (Docstrings.docstring s loc)
+        else
+          COMMENT ("*" ^ s, loc)
       }
+  | "(**" (('*'+) as stars)
+      { let+ s, loc =
+          wrap_comment_lexer
+            state
+            (fun state lexbuf ->
+               store_string state.buffer ("*" ^ stars);
+               comment state  lexbuf)
+            lexbuf
+        in
+        COMMENT (s, loc) }
   | "(*)"
-      { let loc = Location.curr lexbuf in
-        Location.prerr_warning loc Warnings.Comment_start;
-        state.comment_start_loc <- [loc];
-        Buffer.reset state.buffer;
-        comment state lexbuf >>= fun end_loc ->
-        let s = Buffer.contents state.buffer in
-        Buffer.reset state.buffer;
-        return (COMMENT (s, { loc with Location.loc_end = end_loc.Location.loc_end }))
-      }
+      { if !print_warnings then
+          Location.prerr_warning (Location.curr lexbuf) Warnings.Comment_start;
+        let+ s, loc = wrap_comment_lexer state comment lexbuf in
+        COMMENT (s, loc) }
+  | "(*" (('*'*) as stars) "*)"
+      { if !handle_docstrings && stars="" then
+         (* (**) is an empty docstring *)
+          return (DOCSTRING(Docstrings.docstring "" (Location.curr lexbuf)))
+        else
+          return (COMMENT (stars, Location.curr lexbuf)) }
   | "*)"
       { let loc = Location.curr lexbuf in
         Location.prerr_warning loc Warnings.Comment_not_end;
@@ -613,13 +723,12 @@ rule token state = parse
         lexbuf.lex_curr_p <- { curpos with pos_cnum = curpos.pos_cnum - 1 };
         return STAR
       }
-  | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
-        ("\"" ([^ '\010' '\013' '\"' ] * as name) "\"")?
-        [^ '\010' '\013'] * newline
-      { update_loc lexbuf name (int_of_string num) true 0;
-        token state lexbuf
+  | "#"
+      { let at_beginning_of_line pos = (pos.pos_cnum = pos.pos_bol) in
+        if not (at_beginning_of_line lexbuf.lex_start_p)
+        then return HASH
+        else try directive state lexbuf with Failure _ -> return HASH
       }
-  | "#"  { return HASH }
   | "&"  { return AMPERSAND }
   | "&&" { return AMPERAMPER }
   | "`"  { return BACKQUOTE }
@@ -672,7 +781,7 @@ rule token state = parse
             { return (PREFIXOP op) }
   | ['~' '?'] symbolchar_or_hash + as op
             { return (PREFIXOP op) }
-  | ['=' '<' '|' '&' '$' '>'] symbolchar * as op
+  | ['=' '<' '>' '|' '&' '$'] symbolchar * as op
             { return (keyword_or state op
                        (INFIXOP0 op)) }
   | ['@' '^'] symbolchar * as op
@@ -697,18 +806,35 @@ rule token state = parse
   | _ as illegal_char
       { fail lexbuf (Illegal_character illegal_char) }
 
+and directive state = parse
+  | ([' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
+        ("\"" ([^ '\010' '\013' '\"' ] * as name) "\"") as directive)
+        [^ '\010' '\013'] *
+      {
+        match int_of_string num with
+        | exception _ ->
+            (* PR#7165 *)
+            let explanation = "line number out of range" in
+            fail lexbuf (Invalid_directive ("#" ^ directive, Some explanation))
+        | line_num ->
+           (* Documentation says that the line number should be
+              positive, but we have never guarded against this and it
+              might have useful hackish uses. *)
+            update_loc lexbuf (Some name) (line_num - 1) true 0;
+            token state lexbuf
+      }
 and comment state = parse
     "(*"
       { state.comment_start_loc <- (Location.curr lexbuf) :: state.comment_start_loc;
-      Buffer.add_string state.buffer (Lexing.lexeme lexbuf);
-      comment state lexbuf
-    }
+        store_lexeme state.buffer lexbuf;
+        comment state lexbuf
+      }
   | "*)"
       { match state.comment_start_loc with
         | [] -> assert false
         | [_] -> state.comment_start_loc <- []; return (Location.curr lexbuf)
         | _ :: l -> state.comment_start_loc <- l;
-                  Buffer.add_string state.buffer (Lexing.lexeme lexbuf);
+                  store_lexeme state.buffer lexbuf;
                   comment state lexbuf
        }
   | "\""
@@ -729,35 +855,37 @@ and comment state = parse
              | e -> fail_loc e l
            )
         ) >>= fun _loc ->
-      state.string_start_loc <- Location.none;
-      Buffer.add_string buffer (String.escaped (Buffer.contents state.buffer));
-      state.buffer <- buffer;
-      Buffer.add_char state.buffer '\"';
-      comment state lexbuf }
-  | "{" ('%' '%'? extattrident blank*)? (lowercase* as delim) "|"
-      {
-        state.string_start_loc <- Location.curr lexbuf;
-        Buffer.add_string state.buffer (Lexing.lexeme lexbuf);
-        (catch (quoted_string delim state lexbuf) (fun e l -> match e with
-             | Unterminated_string ->
-               begin match state.comment_start_loc with
-                 | [] -> assert false
-                 | loc :: _ ->
-                   let start = List.hd (List.rev state.comment_start_loc) in
-                   state.comment_start_loc <- [];
-                   fail_loc (Unterminated_string_in_comment (start, l)) loc
-               end
-             | e -> fail_loc e l
-           )
-        ) >>= fun _loc ->
         state.string_start_loc <- Location.none;
-        Buffer.add_char state.buffer '|';
-        Buffer.add_string state.buffer delim;
-        Buffer.add_char state.buffer '}';
+        Buffer.add_string buffer (String.escaped (Buffer.contents state.buffer));
+        state.buffer <- buffer;
+        store_string_char state.buffer '\"';
         comment state lexbuf }
+  | "{" ('%' '%'? extattrident blank*)? (delim_ext as raw_delim) "|"
+      { match lax_delim raw_delim with
+        | None -> store_lexeme state.buffer lexbuf; comment state lexbuf
+        | Some delim ->
+          state.string_start_loc <- Location.curr lexbuf;
+          Buffer.add_string state.buffer (Lexing.lexeme lexbuf);
+          (catch (quoted_string delim state lexbuf) (fun e l -> match e with
+              | Unterminated_string ->
+                begin match state.comment_start_loc with
+                  | [] -> assert false
+                  | loc :: _ ->
+                    let start = List.hd (List.rev state.comment_start_loc) in
+                    state.comment_start_loc <- [];
+                    fail_loc (Unterminated_string_in_comment (start, l)) loc
+                end
+              | e -> fail_loc e l
+            )
+          ) >>= fun _loc ->
+          state.string_start_loc <- Location.none;
+          Buffer.add_char state.buffer '|';
+          Buffer.add_string state.buffer delim;
+          Buffer.add_char state.buffer '}';
+          comment state lexbuf }
 
   | "\'\'"
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
   | "\'" (newline as nl) "\'"
       { update_loc lexbuf None 1 false 1;
         store_string_char state.buffer '\'';
@@ -765,16 +893,16 @@ and comment state = parse
         store_string_char state.buffer '\'';
         comment state lexbuf
       }
-  | "\'" [^ '\\' '\'' '\010' '\013' ] "'"
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
-  | "\'\\" ['\\' '\"' '\'' 'n' 't' 'b' 'r' ' '] "'"
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
-  | "\'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
+  | "\'" [^ '\\' '\'' '\010' '\013' ] "\'"
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
+  | "\'\\" ['\\' '\"' '\'' 'n' 't' 'b' 'r' ' '] "\'"
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
+  | "\'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "\'"
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
   | "\'\\" 'o' ['0'-'3'] ['0'-'7'] ['0'-'7'] "\'"
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
-  | "\'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
+  | "\'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "\'"
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
   | eof
       { match state.comment_start_loc with
         | [] -> assert false
@@ -788,29 +916,37 @@ and comment state = parse
         store_normalized_newline state.buffer nl;
         comment state lexbuf
       }
-  | (lowercase | uppercase) identchar *
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
+  | ident
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
   | _
-      { Buffer.add_string state.buffer (Lexing.lexeme lexbuf); comment state lexbuf }
+      { store_lexeme state.buffer lexbuf; comment state lexbuf }
 
 and string state = parse
     '\"'
       { return lexbuf.lex_start_p  }
-  | '\\' newline ([' ' '\t'] * as space)
+  | '\\' (newline as nl) ([' ' '\t'] * as space)
       { update_loc lexbuf None 1 false (String.length space);
+        if in_comment state then begin
+          store_string_char state.buffer '\\';
+          store_normalized_newline state.buffer nl;
+          store_string state.buffer space;
+        end;
         string state lexbuf
       }
-  | '\\' ['\\' '\'' '\"' 'n' 't' 'b' 'r' ' ']
-      { Buffer.add_char state.buffer
-          (char_for_backslash (Lexing.lexeme_char lexbuf 1));
+  | '\\' (['\\' '\'' '\"' 'n' 't' 'b' 'r' ' '] as c)
+      { store_escaped_char state lexbuf (char_for_backslash c);
         string state lexbuf }
   | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
       { char_for_decimal_code state lexbuf 1 >>= fun c ->
-        Buffer.add_char state.buffer c;
-        string state lexbuf }
+        store_escaped_char state lexbuf c;
+         string state lexbuf }
+  | '\\' 'o' ['0'-'7'] ['0'-'7'] ['0'-'7']
+      { char_for_octal_code state lexbuf 2 >>= fun c ->
+        store_escaped_char state lexbuf c;
+         string state lexbuf }
   | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
-      { Buffer.add_char state.buffer (char_for_hexadecimal_code lexbuf 2);
-        string state lexbuf }
+      { store_escaped_char state lexbuf (char_for_hexadecimal_code lexbuf 2);
+         string state lexbuf }
   | '\\' 'u' '{' hex_digit+ '}'
       { store_escaped_uchar state lexbuf (uchar_for_uchar_escape lexbuf);
         string state lexbuf }
@@ -819,13 +955,11 @@ and string state = parse
         then string state lexbuf
         else begin
 (*  Should be an error, but we are very lax.
-                  fail (Illegal_escape (Lexing.lexeme lexbuf),
-                        (Location.curr lexbuf)
+          error lexbuf (Illegal_escape (Lexing.lexeme lexbuf, None))
 *)
           let loc = Location.curr lexbuf in
           Location.prerr_warning loc Warnings.Illegal_backslash;
-          Buffer.add_char state.buffer (Lexing.lexeme_char lexbuf 0);
-          Buffer.add_char state.buffer (Lexing.lexeme_char lexbuf 1);
+          store_lexeme state.buffer lexbuf;
           string state lexbuf
         end
       }
@@ -852,16 +986,15 @@ and quoted_string delim state = parse
       { let loc = state.string_start_loc in
         state.string_start_loc <- Location.none;
         fail_loc Unterminated_string loc }
-  | "|" lowercase* "}"
+  | "|" (ident_ext? as raw_edelim) "}"
       {
-        let edelim = Lexing.lexeme lexbuf in
-        let edelim = String.sub edelim ~pos:1 ~len:(String.length edelim - 2) in
+        let* edelim = validate_encoding lexbuf raw_edelim in
         if delim = edelim then return lexbuf.lex_start_p
-        else (Buffer.add_string state.buffer (Lexing.lexeme lexbuf);
+        else (store_lexeme state.buffer lexbuf;
               quoted_string delim state lexbuf)
       }
-  | _
-      { Buffer.add_char state.buffer (Lexing.lexeme_char lexbuf 0);
+  | (_ as c)
+      { store_string_char state.buffer c;
         quoted_string delim state lexbuf }
 
 and skip_sharp_bang state = parse