From acf4fb9ed9986e8f79554d19aaff07474dc05859 Mon Sep 17 00:00:00 2001 From: Andrea Bedini Date: Tue, 30 Jan 2024 14:30:11 +0800 Subject: [PATCH] Add comments and whitespace tokens to the lexer --- Cabal-syntax/src/Distribution/Fields/Lexer.x | 22 +++++++++++-------- .../src/Distribution/Fields/Parser.hs | 15 ++++++++++++- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/Cabal-syntax/src/Distribution/Fields/Lexer.x b/Cabal-syntax/src/Distribution/Fields/Lexer.x index 4fc501d5186..4be9fa02303 100644 --- a/Cabal-syntax/src/Distribution/Fields/Lexer.x +++ b/Cabal-syntax/src/Distribution/Fields/Lexer.x @@ -15,7 +15,7 @@ #endif {-# OPTIONS_GHC -fno-warn-unused-imports #-} module Distribution.Fields.Lexer - (ltest, lexToken, Token(..), LToken(..) + (ltest, lexString, lexByteString, lexToken, Token(..), LToken(..) ,bol_section, in_section, in_field_layout, in_field_braces ,mkLexState) where @@ -84,8 +84,9 @@ tokens :- { @nbspspacetab* @nl { \pos len inp -> checkWhitespace pos len inp >> adjustPos retPos >> lexToken } -- no @nl here to allow for comments on last line of the file with no trailing \n - $spacetab* "--" $comment* ; -- TODO: check the lack of @nl works here - -- including counting line numbers + -- TODO: check the lack of @nl works here including counting line numbers + $spacetab* { toki Whitespace } + "--" $comment* { toki Comment } } { @@ -100,14 +101,16 @@ tokens :- when (len' /= len) $ adjustPos (incPos (len' - len)) setStartCode in_section return (L pos (Indent len')) } + + -- FIXME: this whitespace needs to be captured $spacetab* \{ { tok OpenBrace } $spacetab* \} { tok CloseBrace } } { - $spacetab+ ; --TODO: don't allow tab as leading space - - "--" $comment* ; + --TODO: don't allow tab as leading space + $spacetab+ { toki Whitespace } + "--" $comment* { toki Comment } @name { toki TokSym } @string { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) } @@ -133,7 +136,7 @@ tokens :- } { - $spacetab+; + $spacetab+ { toki Whitespace } $field_layout' $field_layout* { toki TokFieldLine } @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken } } @@ -143,7 +146,7 @@ tokens :- } { - $spacetab+; + $spacetab+ { toki Whitespace } $field_braces' $field_braces* { toki TokFieldLine } \{ { tok OpenBrace } \} { tok CloseBrace } @@ -161,6 +164,8 @@ data Token = TokSym !ByteString -- ^ Haskell-like identifier, number or | Colon | OpenBrace | CloseBrace + | Whitespace !ByteString + | Comment !ByteString | EOF | LexicalError InputStream --TODO: add separate string lexical error deriving Show @@ -230,7 +235,6 @@ lexToken = do setInput inp' let !len_bytes = B.length inp - B.length inp' t <- action pos len_bytes inp - --traceShow t $ return tok return t diff --git a/Cabal-syntax/src/Distribution/Fields/Parser.hs b/Cabal-syntax/src/Distribution/Fields/Parser.hs index e018caa7fe0..db124be219d 100644 --- a/Cabal-syntax/src/Distribution/Fields/Parser.hs +++ b/Cabal-syntax/src/Distribution/Fields/Parser.hs @@ -1,7 +1,7 @@ {-# LANGUAGE CPP #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE PatternGuards #-} ----------------------------------------------------------------------------- @@ -77,6 +77,10 @@ instance Stream LexState' Identity LToken where uncons (LexState' _ (tok, st')) = case tok of L _ EOF -> return Nothing + -- FIXME: DEBUG: uncomment these lines to skip new tokens and restore old lexer behaviour + -- L _ (Whitespace _) -> uncons st' + -- L _ (Comment _) -> uncons st' + -- FIXME: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ _ -> return (Just (tok, st')) -- | Get lexer warnings accumulated so far @@ -116,6 +120,8 @@ describeToken t = case t of OpenBrace -> "\"{\"" CloseBrace -> "\"}\"" -- SemiColon -> "\";\"" + Whitespace s -> "whitespace " ++ show s + Comment s -> "comment " ++ show s EOF -> "end of file" LexicalError is -> "character in input " ++ show (B8.head is) @@ -135,6 +141,12 @@ tokOpenBrace = getTokenWithPos $ \t -> case t of L pos OpenBrace -> Just pos; _ tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing +tokComment :: Parser B8.ByteString +tokComment = getToken (\case Comment s -> Just s; _ -> Nothing) *> tokWhitespace + +tokWhitespace :: Parser B8.ByteString +tokWhitespace = getToken (\case Whitespace s -> Just s; _ -> Nothing) + colon, openBrace, closeBrace :: Parser () sectionArg :: Parser (SectionArg Position) sectionArg = tokSym' <|> tokStr <|> tokOther "section parameter" @@ -161,6 +173,7 @@ incIndentLevel (IndentLevel i) = IndentLevel (succ i) indentOfAtLeast :: IndentLevel -> Parser IndentLevel indentOfAtLeast (IndentLevel i) = try $ do + skipMany (skipOptional tokWhitespace >> tokComment) j <- tokIndent guard (j >= i) "indentation of at least " ++ show i return (IndentLevel j)