Skip to content

Commit

Permalink
Add comments and whitespace tokens to the lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
andreabedini committed Jan 30, 2024
1 parent e268c0a commit acf4fb9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 10 deletions.
22 changes: 13 additions & 9 deletions Cabal-syntax/src/Distribution/Fields/Lexer.x
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#endif
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
module Distribution.Fields.Lexer
(ltest, lexToken, Token(..), LToken(..)
(ltest, lexString, lexByteString, lexToken, Token(..), LToken(..)
,bol_section, in_section, in_field_layout, in_field_braces
,mkLexState) where

Expand Down Expand Up @@ -84,8 +84,9 @@ tokens :-
<bol_section, bol_field_layout, bol_field_braces> {
@nbspspacetab* @nl { \pos len inp -> checkWhitespace pos len inp >> adjustPos retPos >> lexToken }
-- no @nl here to allow for comments on last line of the file with no trailing \n
$spacetab* "--" $comment* ; -- TODO: check the lack of @nl works here
-- including counting line numbers
-- TODO: check the lack of @nl works here including counting line numbers
$spacetab* { toki Whitespace }
"--" $comment* { toki Comment }
}

<bol_section> {
Expand All @@ -100,14 +101,16 @@ tokens :-
when (len' /= len) $ adjustPos (incPos (len' - len))
setStartCode in_section
return (L pos (Indent len')) }

-- FIXME: this whitespace needs to be captured
$spacetab* \{ { tok OpenBrace }
$spacetab* \} { tok CloseBrace }
}

<in_section> {
$spacetab+ ; --TODO: don't allow tab as leading space

"--" $comment* ;
--TODO: don't allow tab as leading space
$spacetab+ { toki Whitespace }
"--" $comment* { toki Comment }

@name { toki TokSym }
@string { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
Expand All @@ -133,7 +136,7 @@ tokens :-
}

<in_field_layout> {
$spacetab+;
$spacetab+ { toki Whitespace }
$field_layout' $field_layout* { toki TokFieldLine }
@nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken }
}
Expand All @@ -143,7 +146,7 @@ tokens :-
}

<in_field_braces> {
$spacetab+;
$spacetab+ { toki Whitespace }
$field_braces' $field_braces* { toki TokFieldLine }
\{ { tok OpenBrace }
\} { tok CloseBrace }
Expand All @@ -161,6 +164,8 @@ data Token = TokSym !ByteString -- ^ Haskell-like identifier, number or
| Colon
| OpenBrace
| CloseBrace
| Whitespace !ByteString
| Comment !ByteString
| EOF
| LexicalError InputStream --TODO: add separate string lexical error
deriving Show
Expand Down Expand Up @@ -230,7 +235,6 @@ lexToken = do
setInput inp'
let !len_bytes = B.length inp - B.length inp'
t <- action pos len_bytes inp
--traceShow t $ return tok
return t


Expand Down
15 changes: 14 additions & 1 deletion Cabal-syntax/src/Distribution/Fields/Parser.hs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}

-----------------------------------------------------------------------------

Expand Down Expand Up @@ -77,6 +77,10 @@ instance Stream LexState' Identity LToken where
uncons (LexState' _ (tok, st')) =
case tok of
L _ EOF -> return Nothing
-- FIXME: DEBUG: uncomment these lines to skip new tokens and restore old lexer behaviour
-- L _ (Whitespace _) -> uncons st'
-- L _ (Comment _) -> uncons st'
-- FIXME: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ -> return (Just (tok, st'))

-- | Get lexer warnings accumulated so far
Expand Down Expand Up @@ -116,6 +120,8 @@ describeToken t = case t of
OpenBrace -> "\"{\""
CloseBrace -> "\"}\""
-- SemiColon -> "\";\""
Whitespace s -> "whitespace " ++ show s
Comment s -> "comment " ++ show s
EOF -> "end of file"
LexicalError is -> "character in input " ++ show (B8.head is)

Expand All @@ -135,6 +141,12 @@ tokOpenBrace = getTokenWithPos $ \t -> case t of L pos OpenBrace -> Just pos; _
tokCloseBrace = getToken $ \t -> case t of CloseBrace -> Just (); _ -> Nothing
tokFieldLine = getTokenWithPos $ \t -> case t of L pos (TokFieldLine s) -> Just (FieldLine pos s); _ -> Nothing

tokComment :: Parser B8.ByteString
tokComment = getToken (\case Comment s -> Just s; _ -> Nothing) *> tokWhitespace

tokWhitespace :: Parser B8.ByteString
tokWhitespace = getToken (\case Whitespace s -> Just s; _ -> Nothing)

colon, openBrace, closeBrace :: Parser ()
sectionArg :: Parser (SectionArg Position)
sectionArg = tokSym' <|> tokStr <|> tokOther <?> "section parameter"
Expand All @@ -161,6 +173,7 @@ incIndentLevel (IndentLevel i) = IndentLevel (succ i)

indentOfAtLeast :: IndentLevel -> Parser IndentLevel
indentOfAtLeast (IndentLevel i) = try $ do
skipMany (skipOptional tokWhitespace >> tokComment)
j <- tokIndent
guard (j >= i) <?> "indentation of at least " ++ show i
return (IndentLevel j)
Expand Down

0 comments on commit acf4fb9

Please sign in to comment.