diff --git a/cryptol.cabal b/cryptol.cabal index df4207922..8e68eb274 100644 --- a/cryptol.cabal +++ b/cryptol.cabal @@ -81,6 +81,8 @@ library Exposed-modules: Cryptol.Parser, Cryptol.Parser.Lexer, + Cryptol.Parser.Token, + Cryptol.Parser.Layout, Cryptol.Parser.AST, Cryptol.Parser.Position, Cryptol.Parser.Names, diff --git a/src/Cryptol/Parser.y b/src/Cryptol/Parser.y index dfc982931..a9fb04d1b 100644 --- a/src/Cryptol/Parser.y +++ b/src/Cryptol/Parser.y @@ -35,6 +35,7 @@ import Control.Monad(liftM2,msum) import Cryptol.Parser.AST import Cryptol.Parser.Position import Cryptol.Parser.LexerUtils hiding (mkIdent) +import Cryptol.Parser.Token import Cryptol.Parser.ParserUtils import Cryptol.Parser.Unlit(PreProc(..), guessPreProc) import Cryptol.Utils.Ident(paramInstModName) diff --git a/src/Cryptol/Parser/Layout.hs b/src/Cryptol/Parser/Layout.hs new file mode 100644 index 000000000..6508de120 --- /dev/null +++ b/src/Cryptol/Parser/Layout.hs @@ -0,0 +1,237 @@ +{-# Language BlockArguments #-} +{-# Language OverloadedStrings #-} +module Cryptol.Parser.Layout where + +import Cryptol.Utils.Panic(panic) +import Cryptol.Parser.Position +import Cryptol.Parser.Token + +{- + +We assume the existence of an explicit EOF token at the end of the input. This token is *less* indented +than all other tokens (i.e., it is at column 0) + +Explicit Layout Blocks + + * The symbols `(`, `{`, and `[` start an explicit layout block. + * While in an explicit layout block we pass through tokens, except: + - We may start new implicit or explicit layout blocks + - A `,` terminates any *nested* layout blocks + - We terminate the current layout block if we encounter the matching + closing symbol `)`, `}`, `]` + +Implicit Layout Blocks + + * The keywords `where`, `private`, and `parameter` start an implicit + layout block. + * The layout block starts at the column of the *following* token and we + insert "virtual start block" between the current and the following tokens. + * While in an implicit layout block: + - We may start new implicit or explicit layout blocks + - We insert a "virtual separator" before tokens starting at the same + column as the layout block, EXCEPT: + * we do not insert a separator if the previous token was a + "documentation comment" + * we do not insert a separator before the first token in the block + + - The implicit layout block is ended by: + * a token than is less indented that the block, or + * `)`, `}`, `]`, or + * ',' but only if there is an outer paren block + block's column. + - When an implicit layout block ends, we insert a "virtual end block" + token just before the token that caused the block to end. + +Examples: + +f = x where x = 0x1 -- end implicit layout by layout +g = 0x3 -- (`g` is less indented than `x`) + +f (x where x = 2) -- end implicit layout by `)` + +[ x where x = 2, 3 ] -- end implicit layout by `,` + +module A where -- two implicit layout blocks with the +private -- *same* indentation (`where` and `private`) +x = 0x2 +-} + + +layout :: Bool -> [Located Token] -> [Located Token] +layout isMod ts0 + + -- Star an implicit layout block at the top of the module + | let t = head ts0 + rng = srcRange t + blockCol = max 1 (col (from rng)) -- see startImplicitBlock + , isMod && tokenType (thing t) /= KW KW_module = + virt rng VCurlyL : go [ Virtual blockCol ] blockCol True ts0 + + | otherwise = + go [] 0 False ts0 + + where + + {- State parameters for `go`: + + stack: + The stack of implicit and explicit blocks + + lastVirt: + The indentation of the outer most implicit block, or 0 if none. + This can be computed from the stack but we cache + it here as we need to check it on each token. + + noVirtSep: + Do not emit a virtual separator even if token matches block alignment. + This is enabled at the beginning of a block, or after a doc string, + or if we just emitted a separtor, but have not yet consumed the + next token. + + tokens: + remaining tokens to process + -} + + go stack lastVirt noVirtSep tokens + + -- End implicit layout due to indentation. If the outermost block + -- is a lyout block we just end it. If the outermost block is an + -- explicit layout block we report a lexical error. + | col curLoc < lastVirt = + endImplictBlock + + -- End implicit layout block due to a symbol + | Just (Virtual {}) <- curBlock, endsLayout curTokTy = + endImplictBlock + + -- End implicit layout block due to a comma + | Just (Virtual {}) <- curBlock + , Sym Comma <- curTokTy + , not (null [ () | Explicit _ <- popStack ]) = + endImplictBlock + + -- Insert a virtual separator + | Just (Virtual {}) <- curBlock + , col curLoc == lastVirt && not noVirtSep = + virt curRange VSemi : go stack lastVirt True tokens + + -- Start a new implicit layout. Advances token position. + | startsLayout curTokTy = startImplicitBlock + + -- Start a paren block. Advances token position + | Just close <- startsParenBlock curTokTy = + curTok : go (Explicit close : stack) lastVirt False advanceTokens + + -- End a paren block. Advances token position + | Just (Explicit close) <- curBlock, close == curTokTy = + curTok : go popStack lastVirt False advanceTokens + + -- Disable virtual separator after doc string. Advances token position + | White DocStr <- curTokTy = + curTok : go stack lastVirt True advanceTokens + + -- Check to see if we are done. Note that if we got here, implicit layout + -- blocks should have already been closed, as `EOF` is less indented than + -- all other tokens + | EOF <- curTokTy = + [curTok] + + -- Any other token, just emit. Advances token position + | otherwise = + curTok : go stack lastVirt False advanceTokens + + where + curTok : advanceTokens = tokens + curTokTy = tokenType (thing curTok) + curRange = srcRange curTok + curLoc = from curRange + + (curBlock,popStack) = + case stack of + a : b -> (Just a,b) + [] -> (Nothing, panic "layout" ["pop empty stack"]) + + + startImplicitBlock = + let nextRng = srcRange (head advanceTokens) + nextLoc = from nextRng + blockCol = max 1 (col nextLoc) + -- the `max` ensuraes that indentation is always at least 1, + -- in case we are starting a block at the very end of the input + + in curTok + : virt nextRng VCurlyL + : go (Virtual blockCol : stack) blockCol True advanceTokens + + + endImplictBlock = + case curBlock of + Just (Virtual {}) -> + virt curRange VCurlyR + : go popStack newVirt False tokens + where newVirt = case [ n | Virtual n <- popStack ] of + n : _ -> n + _ -> 0 + + Just (Explicit c) -> + errTok curRange (InvalidIndentation c) : advanceTokens + + Nothing -> panic "layout" ["endImplictBlock with empty stack"] + + +-------------------------------------------------------------------------------- + +data Block = + Virtual Int -- ^ Virtual layout block + | Explicit TokenT -- ^ An explicit layout block, expecting this ending token. + deriving (Show) + +-- | These tokens start an implicit layout block +startsLayout :: TokenT -> Bool +startsLayout ty = + case ty of + KW KW_where -> True + KW KW_private -> True + KW KW_parameter -> True + _ -> False + +-- | These tokens end an implicit layout block +endsLayout :: TokenT -> Bool +endsLayout ty = + case ty of + Sym BracketR -> True + Sym ParenR -> True + Sym CurlyR -> True + _ -> False + +-- | These tokens start an explicit "paren" layout block. +-- If so, the result contains the corresponding closing paren. +startsParenBlock :: TokenT -> Maybe TokenT +startsParenBlock ty = + case ty of + Sym BracketL -> Just (Sym BracketR) + Sym ParenL -> Just (Sym ParenR) + Sym CurlyL -> Just (Sym CurlyR) + _ -> Nothing + + +-------------------------------------------------------------------------------- + +-- | Make a virtual token of the given type +virt :: Range -> TokenV -> Located Token +virt rng x = Located { srcRange = rng { to = from rng }, thing = t } + where + t = Token (Virt x) + case x of + VCurlyL -> "beginning of layout block" + VCurlyR -> "end of layout block" + VSemi -> "layout block separator" + +errTok :: Range -> TokenErr -> Located Token +errTok rng x = Located { srcRange = rng { to = from rng }, thing = t } + where + t = Token { tokenType = Err x, tokenText = "" } + + + + diff --git a/src/Cryptol/Parser/Lexer.x b/src/Cryptol/Parser/Lexer.x index 36eba1275..74063ce0a 100644 --- a/src/Cryptol/Parser/Lexer.x +++ b/src/Cryptol/Parser/Lexer.x @@ -18,10 +18,13 @@ module Cryptol.Parser.Lexer , Located(..) , Config(..) , defaultConfig + , dbgLex ) where import Cryptol.Parser.Position +import Cryptol.Parser.Token import Cryptol.Parser.LexerUtils +import qualified Cryptol.Parser.Layout as L import Cryptol.Parser.Unlit(unLit) import Data.Text (Text) import qualified Data.Text as Text @@ -195,7 +198,7 @@ stateToInt (InChar {}) = char -- This stream is fed to the parser. lexer :: Config -> Text -> ([Located Token], Position) lexer cfg cs = ( case cfgLayout cfg of - Layout -> layout cfg lexemes + Layout -> L.layout (cfgModuleScope cfg) lexemes NoLayout -> lexemes , finalPos ) @@ -253,5 +256,10 @@ primLexer cfg cs = run inp Normal (rest,pos) = run i' $! s' in (mtok ++ rest, pos) +dbgLex file = + do txt <- readFile file + let (ts,_) = lexer defaultConfig (Text.pack txt) + mapM_ (print . thing) ts + -- vim: ft=haskell } diff --git a/src/Cryptol/Parser/LexerUtils.hs b/src/Cryptol/Parser/LexerUtils.hs index 7b18c515e..41bb3ff0c 100644 --- a/src/Cryptol/Parser/LexerUtils.hs +++ b/src/Cryptol/Parser/LexerUtils.hs @@ -5,19 +5,9 @@ -- Maintainer : cryptol@galois.com -- Stability : provisional -- Portability : portable - -{-# LANGUAGE DeriveAnyClass #-} -{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE PatternGuards #-} -{-# LANGUAGE BlockArguments #-} module Cryptol.Parser.LexerUtils where -import Cryptol.Parser.Position -import Cryptol.Parser.Unlit(PreProc(None)) -import Cryptol.Utils.PP -import Cryptol.Utils.Panic - import Control.Monad(guard) import Data.Char(toLower,generalCategory,isAscii,ord,isSpace, isAlphaNum,isAlpha) @@ -27,8 +17,12 @@ import qualified Data.Text as T import qualified Data.Text.Read as T import Data.Word(Word8) -import GHC.Generics (Generic) -import Control.DeepSeq +import Cryptol.Utils.Panic +import Cryptol.Parser.Position +import Cryptol.Parser.Token +import Cryptol.Parser.Unlit(PreProc(None)) + + data Config = Config { cfgSource :: !FilePath -- ^ File that we are working on @@ -362,210 +356,6 @@ dropWhite = filter (notWhite . tokenType . thing) notWhite _ = True -data Block = Virtual Int -- ^ Virtual layout block - | Explicit TokenT -- ^ An explicit layout block, expecting this ending - -- token. - deriving (Show) - -isExplicit :: Block -> Bool -isExplicit Explicit{} = True -isExplicit Virtual{} = False - -startsLayout :: TokenT -> Bool -startsLayout (KW KW_where) = True -startsLayout (KW KW_private) = True -startsLayout (KW KW_parameter) = True -startsLayout _ = False - --- Add separators computed from layout -layout :: Config -> [Located Token] -> [Located Token] -layout cfg ts0 = loop False implicitScope [] ts0 - where - - (_pos0,implicitScope) = case ts0 of - t : _ -> (from (srcRange t), cfgModuleScope cfg && tokenType (thing t) /= KW KW_module) - _ -> (start,False) - - - loop :: Bool -> Bool -> [Block] -> [Located Token] -> [Located Token] - loop afterDoc startBlock stack (t : ts) - | startsLayout ty = toks ++ loop False True stack' ts - - -- We don't do layout within these delimeters - | Sym ParenL <- ty = toks ++ loop False False (Explicit (Sym ParenR) : stack') ts - | Sym CurlyL <- ty = toks ++ loop False False (Explicit (Sym CurlyR) : stack') ts - | Sym BracketL <- ty = toks ++ loop False False (Explicit (Sym BracketR) : stack') ts - - | EOF <- ty = toks - | White DocStr <- ty = toks ++ loop True False stack' ts - | otherwise = toks ++ loop False False stack' ts - - where - ty = tokenType (thing t) - pos = srcRange t - - (toks,offStack) - | afterDoc = ([t], stack) - | otherwise = offsides startToks t stack - - -- add any block start tokens, and push a level on the stack - (startToks,stack') - | startBlock && ty == EOF = ( [ virt cfg (to pos) VCurlyR - , virt cfg (to pos) VCurlyL ] - , offStack ) - | startBlock = ( [ virt cfg (to pos) VCurlyL ], Virtual (col (from pos)) : offStack ) - | otherwise = ( [], offStack ) - - loop _ _ _ [] = panic "[Lexer] layout" ["Missing EOF token"] - - - offsides :: [Located Token] -> Located Token -> [Block] -> ([Located Token], [Block]) - offsides startToks t = go startToks - where - go virts stack = case stack of - - -- delimit or close a layout block - Virtual c : rest - -- commas only close to an explicit marker, so if there is none, the - -- comma doesn't close anything - | Sym Comma == ty -> - if any isExplicit rest - then go (virt cfg (to pos) VCurlyR : virts) rest - else done virts stack - - | closingToken -> go (virt cfg (to pos) VCurlyR : virts) rest - | col (from pos) == c -> done (virt cfg (to pos) VSemi : virts) stack - | col (from pos) < c -> go (virt cfg (to pos) VCurlyR : virts) rest - - -- close an explicit block - Explicit close : rest | close == ty -> done virts rest - | Sym Comma == ty -> done virts stack - - _ -> done virts stack - - ty = tokenType (thing t) - pos = srcRange t - - done ts s = (reverse (t:ts), s) - - closingToken = ty `elem` [ Sym ParenR, Sym BracketR, Sym CurlyR ] - -virt :: Config -> Position -> TokenV -> Located Token -virt cfg pos x = Located { srcRange = Range - { from = pos - , to = pos - , source = cfgSource cfg - } - , thing = t } - where t = Token (Virt x) $ case x of - VCurlyL -> "beginning of layout block" - VCurlyR -> "end of layout block" - VSemi -> "layout block separator" - --------------------------------------------------------------------------------- - -data Token = Token { tokenType :: !TokenT, tokenText :: !Text } - deriving (Show, Generic, NFData) - --- | Virtual tokens, inserted by layout processing. -data TokenV = VCurlyL| VCurlyR | VSemi - deriving (Eq, Show, Generic, NFData) - -data TokenW = BlockComment | LineComment | Space | DocStr - deriving (Eq, Show, Generic, NFData) - -data TokenKW = KW_else - | KW_extern - | KW_fin - | KW_if - | KW_private - | KW_include - | KW_inf - | KW_lg2 - | KW_lengthFromThen - | KW_lengthFromThenTo - | KW_max - | KW_min - | KW_module - | KW_submodule - | KW_newtype - | KW_pragma - | KW_property - | KW_then - | KW_type - | KW_where - | KW_let - | KW_x - | KW_import - | KW_as - | KW_hiding - | KW_infixl - | KW_infixr - | KW_infix - | KW_primitive - | KW_parameter - | KW_constraint - | KW_Prop - deriving (Eq, Show, Generic, NFData) - --- | The named operators are a special case for parsing types, and 'Other' is --- used for all other cases that lexed as an operator. -data TokenOp = Plus | Minus | Mul | Div | Exp | Mod - | Equal | LEQ | GEQ - | Complement | Hash | At - | Other [T.Text] T.Text - deriving (Eq, Show, Generic, NFData) - -data TokenSym = Bar - | ArrL | ArrR | FatArrR - | Lambda - | EqDef - | Comma - | Semi - | Dot - | DotDot - | DotDotDot - | DotDotLt - | Colon - | BackTick - | ParenL | ParenR - | BracketL | BracketR - | CurlyL | CurlyR - | TriL | TriR - | Lt - | Underscore - deriving (Eq, Show, Generic, NFData) - -data TokenErr = UnterminatedComment - | UnterminatedString - | UnterminatedChar - | InvalidString - | InvalidChar - | LexicalError - | MalformedLiteral - | MalformedSelector - deriving (Eq, Show, Generic, NFData) - -data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int - deriving (Eq, Show, Generic, NFData) - -data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits - | Frac !Rational !Int -- ^ value, base. - | ChrLit !Char -- ^ character literal - | Ident ![T.Text] !T.Text -- ^ (qualified) identifier - | StrLit !String -- ^ string literal - | Selector !SelectorType -- ^ .hello or .123 - | KW !TokenKW -- ^ keyword - | Op !TokenOp -- ^ operator - | Sym !TokenSym -- ^ symbol - | Virt !TokenV -- ^ virtual token (for layout) - | White !TokenW -- ^ white space token - | Err !TokenErr -- ^ error token - | EOF - deriving (Eq, Show, Generic, NFData) - -instance PP Token where - ppPrec _ (Token _ s) = text (T.unpack s) -- | Collapse characters into a single Word8, identifying ASCII, and classes of -- unicode. This came from: diff --git a/src/Cryptol/Parser/ParserUtils.hs b/src/Cryptol/Parser/ParserUtils.hs index 0f1ebda36..f64313187 100644 --- a/src/Cryptol/Parser/ParserUtils.hs +++ b/src/Cryptol/Parser/ParserUtils.hs @@ -35,7 +35,7 @@ import Prelude.Compat import Cryptol.Parser.AST import Cryptol.Parser.Lexer -import Cryptol.Parser.LexerUtils(SelectorType(..)) +import Cryptol.Parser.Token(SelectorType(..)) import Cryptol.Parser.Position import Cryptol.Parser.Utils (translateExprToNumT,widthIdent) import Cryptol.Utils.Ident(packModName,packIdent,modNameChunks) @@ -81,6 +81,12 @@ lexerP k = P $ \cfg p s -> T.unpack (tokenText it) MalformedSelector -> "malformed selector: " ++ T.unpack (tokenText it) + InvalidIndentation c -> "invalid indentation, unmatched " ++ + case c of + Sym CurlyR -> "{ ... } " + Sym ParenR -> "( ... )" + Sym BracketR -> "[ ... ]" + _ -> show c -- basically panic ] where it = thing t diff --git a/src/Cryptol/Parser/Token.hs b/src/Cryptol/Parser/Token.hs new file mode 100644 index 000000000..280f8157d --- /dev/null +++ b/src/Cryptol/Parser/Token.hs @@ -0,0 +1,116 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DeriveGeneric #-} +module Cryptol.Parser.Token where + +import Data.Text(Text) +import qualified Data.Text as Text +import Control.DeepSeq +import GHC.Generics + +import Cryptol.Utils.PP + +data Token = Token { tokenType :: !TokenT, tokenText :: !Text } + deriving (Show, Generic, NFData) + +-- | Virtual tokens, inserted by layout processing. +data TokenV = VCurlyL| VCurlyR | VSemi + deriving (Eq, Show, Generic, NFData) + +data TokenW = BlockComment | LineComment | Space | DocStr + deriving (Eq, Show, Generic, NFData) + +data TokenKW = KW_else + | KW_extern + | KW_fin + | KW_if + | KW_private + | KW_include + | KW_inf + | KW_lg2 + | KW_lengthFromThen + | KW_lengthFromThenTo + | KW_max + | KW_min + | KW_module + | KW_submodule + | KW_newtype + | KW_pragma + | KW_property + | KW_then + | KW_type + | KW_where + | KW_let + | KW_x + | KW_import + | KW_as + | KW_hiding + | KW_infixl + | KW_infixr + | KW_infix + | KW_primitive + | KW_parameter + | KW_constraint + | KW_Prop + deriving (Eq, Show, Generic, NFData) + +-- | The named operators are a special case for parsing types, and 'Other' is +-- used for all other cases that lexed as an operator. +data TokenOp = Plus | Minus | Mul | Div | Exp | Mod + | Equal | LEQ | GEQ + | Complement | Hash | At + | Other [Text] Text + deriving (Eq, Show, Generic, NFData) + +data TokenSym = Bar + | ArrL | ArrR | FatArrR + | Lambda + | EqDef + | Comma + | Semi + | Dot + | DotDot + | DotDotDot + | DotDotLt + | Colon + | BackTick + | ParenL | ParenR + | BracketL | BracketR + | CurlyL | CurlyR + | TriL | TriR + | Lt + | Underscore + deriving (Eq, Show, Generic, NFData) + +data TokenErr = UnterminatedComment + | UnterminatedString + | UnterminatedChar + | InvalidString + | InvalidChar + | LexicalError + | MalformedLiteral + | MalformedSelector + | InvalidIndentation TokenT -- expected closing paren + deriving (Eq, Show, Generic, NFData) + +data SelectorType = RecordSelectorTok Text | TupleSelectorTok Int + deriving (Eq, Show, Generic, NFData) + +data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits + | Frac !Rational !Int -- ^ value, base. + | ChrLit !Char -- ^ character literal + | Ident ![Text] !Text -- ^ (qualified) identifier + | StrLit !String -- ^ string literal + | Selector !SelectorType -- ^ .hello or .123 + | KW !TokenKW -- ^ keyword + | Op !TokenOp -- ^ operator + | Sym !TokenSym -- ^ symbol + | Virt !TokenV -- ^ virtual token (for layout) + | White !TokenW -- ^ white space token + | Err !TokenErr -- ^ error token + | EOF + deriving (Eq, Show, Generic, NFData) + +instance PP Token where + ppPrec _ (Token _ s) = text (Text.unpack s) + + diff --git a/tests/issues/T1179.cry b/tests/issues/T1179.cry new file mode 100644 index 000000000..bb3a376b7 --- /dev/null +++ b/tests/issues/T1179.cry @@ -0,0 +1,5 @@ +module T1179 where + +x = 0x2 +private +y = x diff --git a/tests/issues/T1179.icry b/tests/issues/T1179.icry new file mode 100644 index 000000000..53dec4d9e --- /dev/null +++ b/tests/issues/T1179.icry @@ -0,0 +1 @@ +:load T1179.cry diff --git a/tests/issues/T1179.icry.stdout b/tests/issues/T1179.icry.stdout new file mode 100644 index 000000000..2cfcb9e64 --- /dev/null +++ b/tests/issues/T1179.icry.stdout @@ -0,0 +1,3 @@ +Loading module Cryptol +Loading module Cryptol +Loading module T1179 diff --git a/tests/regression/Layout02.cry b/tests/regression/Layout02.cry new file mode 100644 index 000000000..630348407 --- /dev/null +++ b/tests/regression/Layout02.cry @@ -0,0 +1,4 @@ +module Layout02 where + +x = 0x1 where y = ( 2 + ) diff --git a/tests/regression/layout02.icry b/tests/regression/layout02.icry new file mode 100644 index 000000000..6c47ccd37 --- /dev/null +++ b/tests/regression/layout02.icry @@ -0,0 +1 @@ +:load Layout02.cry diff --git a/tests/regression/layout02.icry.stdout b/tests/regression/layout02.icry.stdout new file mode 100644 index 000000000..f165224c7 --- /dev/null +++ b/tests/regression/layout02.icry.stdout @@ -0,0 +1,4 @@ +Loading module Cryptol + +Parse error at Layout02.cry:4:11--4:11 + invalid indentation, unmatched ( ... )