Add comments and whitespace tokens to the lexer

andreabedini · Jan 30, 2024 · acf4fb9 · acf4fb9
1 parent e268c0a
commit acf4fb9
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 10 deletions.
diff --git a/Cabal-syntax/src/Distribution/Fields/Lexer.x b/Cabal-syntax/src/Distribution/Fields/Lexer.x
@@ -15,7 +15,7 @@
 #endif
 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
 module Distribution.Fields.Lexer
-  (ltest, lexToken, Token(..), LToken(..)
+  (ltest, lexString, lexByteString, lexToken, Token(..), LToken(..)
   ,bol_section, in_section, in_field_layout, in_field_braces
   ,mkLexState) where
 
@@ -84,8 +84,9 @@ tokens :-
 <bol_section, bol_field_layout, bol_field_braces> {
   @nbspspacetab* @nl         { \pos len inp -> checkWhitespace pos len inp >> adjustPos retPos >> lexToken }
   -- no @nl here to allow for comments on last line of the file with no trailing \n
-  $spacetab* "--" $comment*  ;  -- TODO: check the lack of @nl works here
-                                -- including counting line numbers
+  -- TODO: check the lack of @nl works here including counting line numbers
+  $spacetab*      { toki Whitespace }
+  "--" $comment*  { toki Comment }
 }
 
 <bol_section> {
@@ -100,14 +101,16 @@ tokens :-
                                         when (len' /= len) $ adjustPos (incPos (len' - len))
                                         setStartCode in_section
                                         return (L pos (Indent len')) }
+
+  -- FIXME: this whitespace needs to be captured
   $spacetab* \{    { tok  OpenBrace }
   $spacetab* \}    { tok  CloseBrace }
 }
 
 <in_section> {
-  $spacetab+   ; --TODO: don't allow tab as leading space
-
-  "--" $comment* ;
+  --TODO: don't allow tab as leading space
+  $spacetab+     { toki Whitespace }
+  "--" $comment* { toki Comment }
 
   @name        { toki TokSym }
   @string      { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
@@ -133,7 +136,7 @@ tokens :-
 }
 
 <in_field_layout> {
-  $spacetab+;
+  $spacetab+                     { toki Whitespace }
   $field_layout' $field_layout*  { toki TokFieldLine }
   @nl             { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken }
 }
@@ -143,7 +146,7 @@ tokens :-
 }
 
 <in_field_braces> {
-  $spacetab+;
+  $spacetab+                       { toki Whitespace }
   $field_braces' $field_braces*    { toki TokFieldLine }
   \{                { tok  OpenBrace  }
   \}                { tok  CloseBrace }
@@ -161,6 +164,8 @@ data Token = TokSym   !ByteString       -- ^ Haskell-like identifier, number or
            | Colon
            | OpenBrace
            | CloseBrace
+           | Whitespace   !ByteString
+           | Comment      !ByteString
            | EOF
            | LexicalError InputStream --TODO: add separate string lexical error
   deriving Show
@@ -230,7 +235,6 @@ lexToken = do
         setInput inp'
         let !len_bytes = B.length inp - B.length inp'
         t <- action pos len_bytes inp
-        --traceShow t $ return tok
         return t
 
 

diff --git a/Cabal-syntax/src/Distribution/Fields/Parser.hs b/Cabal-syntax/src/Distribution/Fields/Parser.hs
@@ -1,7 +1,7 @@
 {-# LANGUAGE CPP #-}
+{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
 {-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE PatternGuards #-}
 
 -----------------------------------------------------------------------------
 
@@ -77,6 +77,10 @@ instance Stream LexState' Identity LToken where
   uncons (LexState' _ (tok, st')) =
     case tok of
       L _ EOF -> return Nothing
+      -- FIXME: DEBUG: uncomment these lines to skip new tokens and restore old lexer behaviour
+      -- L _ (Whitespace _) -> uncons st'
+      -- L _ (Comment _) -> uncons st'
+      -- FIXME: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
       _ -> return (Just (tok, st'))
 
 -- | Get lexer warnings accumulated so far
@@ -116,6 +120,8 @@ describeToken t = case t of
   OpenBrace -> "\"{\""
   CloseBrace -> "\"}\""
   --  SemiColon       -> "\";\""
+  Whitespace s -> "whitespace " ++ show s
+  Comment s -> "comment " ++ show s
   EOF -> "end of file"
   LexicalError is -> "character in input " ++ show (B8.head is)
 
@@ -135,6 +141,12 @@ tokOpenBrace = getTokenWithPos $ \t -> case t of L pos OpenBrace -> Just pos; _
 tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing
 tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing
 
+tokComment :: Parser B8.ByteString
+tokComment = getToken (\case Comment s -> Just s; _ -> Nothing) *> tokWhitespace
+
+tokWhitespace :: Parser B8.ByteString
+tokWhitespace = getToken (\case Whitespace s -> Just s; _ -> Nothing)
+
 colon, openBrace, closeBrace :: Parser ()
 sectionArg :: Parser (SectionArg Position)
 sectionArg = tokSym' <|> tokStr <|> tokOther <?> "section parameter"
@@ -161,6 +173,7 @@ incIndentLevel (IndentLevel i) = IndentLevel (succ i)
 
 indentOfAtLeast :: IndentLevel -> Parser IndentLevel
 indentOfAtLeast (IndentLevel i) = try $ do
+  skipMany (skipOptional tokWhitespace >> tokComment)
   j <- tokIndent
   guard (j >= i) <?> "indentation of at least " ++ show i
   return (IndentLevel j)