[ #257 ] support \r and \f in BNFC, Haskell, Java; in OCaml only \r

BNFC · May 21, 2019 · c3bda39 · c3bda39
1 parent 23dda42
commit c3bda39
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 26 deletions.
diff --git a/source/src/BNFC/Backend/Haskell/CFtoAlex3.hs b/source/src/BNFC/Backend/Haskell/CFtoAlex3.hs
@@ -91,9 +91,9 @@ restOfAlex _ shareStrings byteStrings cf = [
   userDefTokenTypes,
   ident,
 
-  ifC catString ("\\\" ([$u # [\\\" \\\\ \\n]] | (\\\\ (\\\" | \\\\ | \\' | n | t)))* \\\"" ++
+  ifC catString ("\\\" ([$u # [\\\" \\\\ \\n]] | (\\\\ (\\\" | \\\\ | \\' | n | t | r | f)))* \\\"" ++
                   "\n    { tok (\\p s -> PT p (TL $ share $ unescapeInitTail s)) }"),
-  ifC catChar    "\\\' ($u # [\\\' \\\\] | \\\\ [\\\\ \\\' n t]) \\'\n    { tok (\\p s -> PT p (TC $ share s))  }",
+  ifC catChar    "\\\' ($u # [\\\' \\\\] | \\\\ [\\\\ \\\' n t r f]) \\'\n    { tok (\\p s -> PT p (TC $ share s))  }",
   ifC catInteger "$d+\n    { tok (\\p s -> PT p (TI $ share s))    }",
   ifC catDouble  "$d+ \\. $d+ (e (\\-)? $d+)?\n    { tok (\\p s -> PT p (TD $ share s)) }",
   "",
@@ -172,6 +172,8 @@ restOfAlex _ shareStrings byteStrings cf = [
   "    '\\\\':c:cs | elem c ['\\\"', '\\\\', '\\\''] -> c : unesc cs",
   "    '\\\\':'n':cs  -> '\\n' : unesc cs",
   "    '\\\\':'t':cs  -> '\\t' : unesc cs",
+  "    '\\\\':'r':cs  -> '\\r' : unesc cs",
+  "    '\\\\':'f':cs  -> '\\f' : unesc cs",
   "    '\"':[]    -> []",
   "    c:cs      -> c : unesc cs",
   "    _         -> []",
@@ -341,6 +343,8 @@ instance Print Char where
   prt _ = \case
     '\n'             -> ["\\n"]
     '\t'             -> ["\\t"]
+    '\r'             -> ["\\r"]
+    '\f'             -> ["\\f"]
     c | isAlphaNum c -> [[c]]
     c | isPrint c    -> ['\\':[c]]
     c                -> ['\\':show (ord c)]

diff --git a/source/src/BNFC/Backend/Java/CFtoAntlr4Lexer.hs b/source/src/BNFC/Backend/Java/CFtoAntlr4Lexer.hs
@@ -130,10 +130,10 @@ restOfLexerGrammar cf = vcat
         "IDENT : IDENTIFIER_FIRST (IDENTIFIER_FIRST | DIGIT)*;"
         ]
     , "// Whitespace"
-    , "WS : (' ' | '\\r' | '\\t' | '\\n')+ ->  skip;"
+    , "WS : (' ' | '\\r' | '\\t' | '\\n' | '\\f')+ ->  skip;"
     , "// Escapable sequences"
     , "fragment"
-    , "Escapable : ('\"' | '\\\\' | 'n' | 't' | 'r');"
+    , "Escapable : ('\"' | '\\\\' | 'n' | 't' | 'r' | 'f');"
     , "ErrorToken : . ;"
     , ifString stringmodes
     , ifChar charmodes

diff --git a/source/src/BNFC/Backend/Java/CFtoJLex15.hs b/source/src/BNFC/Backend/Java/CFtoJLex15.hs
@@ -50,21 +50,18 @@ import BNFC.Options (JavaLexerParser(..), RecordPositions(..))
 import BNFC.Utils (cstring)
 import Text.PrettyPrint
 
---The environment must be returned for the parser to use.
+-- | The environment is returned for further use in the parser.
 cf2jlex :: JavaLexerParser -> RecordPositions -> String -> CF -> (Doc, SymEnv)
-cf2jlex jflex rp packageBase cf = (vcat
- [
-  prelude jflex rp packageBase,
-  cMacros,
-  lexSymbols jflex env,
-  restOfJLex jflex rp cf
- ], env)
+cf2jlex jflex rp packageBase cf = (, env) . vcat $
+  [ prelude jflex rp packageBase
+  , cMacros
+  , lexSymbols jflex env
+  , restOfJLex jflex rp cf
+  ]
   where
-   env = makeSymEnv (cfgSymbols cf ++ reservedWords cf) (0 :: Int)
-   makeSymEnv [] _ = []
-   makeSymEnv (s:symbs) n = (s, "_SYMB_" ++ show n) : makeSymEnv symbs (n+1)
+  env = zipWith (\ s n -> (s, "_SYMB_" ++ show n)) (cfgSymbols cf ++ reservedWords cf) [(0 :: Int)..]
 
--- | File prelude
+-- | File prelude.
 prelude :: JavaLexerParser -> RecordPositions -> String -> Doc
 prelude jflex rp packageBase = vcat
     [ "// This JLex file was machine-generated by the BNF converter"
@@ -113,15 +110,16 @@ prelude jflex rp packageBase = vcat
         else ""
     ]
   where
-    positionDeclarations =
+    positionDeclarations
       -- JFlex always defines yyline, yychar, yycolumn, even if unused.
-      if jflex == JFlexCup then ""
-        else if rp == RecordPositions then "int yycolumn = unknown - 1;"
-          else vcat
+      | jflex == JFlexCup     = ""
+      | rp == RecordPositions = "int yycolumn = unknown - 1;"
+      | otherwise             = vcat
             -- subtract one so that one based numbering still ends up with unknown.
             [ "int yyline = unknown - 1;"
             , "int yycolumn = unknown - 1;"
-            , "int yychar = unknown;" ]
+            , "int yychar = unknown;"
+            ]
 
 --For now all categories are included.
 --Optimally only the ones that are used should be generated.
@@ -203,9 +201,11 @@ restOfJLex jflex rp cf = vcat
           then "<STRING><<EOF>> { throw new Error(\"Unterminated string at EOF, beginning at \" + left.getLine() + \":\" + left.getColumn()); }"
           else ""
         , "<ESCAPED>n { pstring +=  \"\\n\"; yybegin(STRING); }"
+        , "<ESCAPED>t  { pstring += \"\\t\"; yybegin(STRING); }"
+        , "<ESCAPED>r  { pstring += \"\\r\"; yybegin(STRING); }"
+        , "<ESCAPED>f  { pstring += \"\\f\"; yybegin(STRING); }"
         , "<ESCAPED>\\\" { pstring += \"\\\"\"; yybegin(STRING); }"
         , "<ESCAPED>\\\\ { pstring += \"\\\\\"; yybegin(STRING); }"
-        , "<ESCAPED>t  { pstring += \"\\t\"; yybegin(STRING); }"
         , "<ESCAPED>.  { pstring += yytext(); yybegin(STRING); }"
         , "<ESCAPED>\\r\\n|\\r|\\n { throw new Error(\"Unterminated string on line \" + left.getLine() " <>
           (if jflex == JFlexCup then "+ \" beginning at column \" + left.getColumn()" else "") <> "); }"
@@ -224,6 +224,8 @@ restOfJLex jflex rp cf = vcat
           else ""
         , "<CHARESC>n { yybegin(CHAREND); return cf.newSymbol(\"\", sym._CHAR_, left, right_loc(), new Character('\\n')); }"
         , "<CHARESC>t { yybegin(CHAREND); return cf.newSymbol(\"\", sym._CHAR_, left, right_loc(), new Character('\\t')); }"
+        , "<CHARESC>r { yybegin(CHAREND); return cf.newSymbol(\"\", sym._CHAR_, left, right_loc(), new Character('\\r')); }"
+        , "<CHARESC>f { yybegin(CHAREND); return cf.newSymbol(\"\", sym._CHAR_, left, right_loc(), new Character('\\f')); }"
         , "<CHARESC>. { yybegin(CHAREND); return cf.newSymbol(\"\", sym._CHAR_, left, right_loc(), new Character(yytext().charAt(0))); }"
         , "<CHARESC>\\r\\n|\\r|\\n { throw new Error(\"Unterminated character literal on line \" + left.getLine() " <>
           (if jflex == JFlexCup then "+ \" beginning at column \" + left.getColumn()" else "") <> "); }"

diff --git a/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs b/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs
@@ -60,6 +60,8 @@ header parserMod cf = [
   "      '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs",
   "    | '\\\\'::'n'::cs  -> '\\n' :: unesc cs",
   "    | '\\\\'::'t'::cs  -> '\\t' :: unesc cs",
+  "    | '\\\\'::'r'::cs  -> '\\r' :: unesc cs",
+  -- "    | '\\\\'::'f'::cs  -> '\\f' :: unesc cs",  -- \f not supported by ocaml
   "    | '\\\"'::[]    -> []",
   "    | c::cs      -> c :: unesc cs",
   "    | _         -> []",
@@ -208,10 +210,10 @@ rules cf = mkRule "token" $
     , ( "d+ '.' d+ ('e' ('-')? d+)?"
       , "let f = lexeme lexbuf in TOK_Double (float_of_string f)" )
     -- strings
-    , ( "'\\\"' ((u # ['\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't')))* '\\\"'"
+    , ( "'\\\"' ((u # ['\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'"
       , "let s = lexeme lexbuf in TOK_String (unescapeInitTail s)" )
     -- chars
-    , ( "'\\'' ((u # ['\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't'))) '\\\''"
+    , ( "'\\'' ((u # ['\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''"
       , "let s = lexeme lexbuf in TOK_Char s.[1]")
     -- spaces
     , ( "[' ' '\\t']", "token lexbuf")

diff --git a/source/src/LexBNF.x b/source/src/LexBNF.x
@@ -33,9 +33,9 @@ $white+ ;
 
 $l $i*
     { tok (\p s -> PT p (eitherResIdent (TV . share) s)) }
-\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \"
+\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t | r | f)))* \"
     { tok (\p s -> PT p (TL $ share $ unescapeInitTail s)) }
-\' ($u # [\' \\] | \\ [\\ \' n t]) \'
+\' ($u # [\' \\] | \\ [\\ \' n t r f]) \'
     { tok (\p s -> PT p (TC $ share s))  }
 $d+
     { tok (\p s -> PT p (TI $ share s))    }
@@ -117,6 +117,8 @@ unescapeInitTail = id . unesc . tail . id where
     '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
     '\\':'n':cs  -> '\n' : unesc cs
     '\\':'t':cs  -> '\t' : unesc cs
+    '\\':'r':cs  -> '\r' : unesc cs
+    '\\':'f':cs  -> '\f' : unesc cs
     '"':[]    -> []
     c:cs      -> c : unesc cs
     _         -> []