diff --git a/doc/lexer-rules.md b/doc/lexer-rules.md index 6b4912118df..46aa60b366c 100644 --- a/doc/lexer-rules.md +++ b/doc/lexer-rules.md @@ -225,6 +225,7 @@ An alternative can have more than one command separated by commas. Here are the * pushMode( x ) * type( x ) * channel( x ) +* caseInsensitive( x ) See the book source code for usage, some examples of which are shown here: @@ -308,3 +309,21 @@ As of 4.5, you can also define channel names like enumerations with the followin ``` channels { WSCHANNEL, MYHIDDEN } ``` + +### caseInsensitive() + +Defines whether the token is matched case-insensitively. +The argument can be `true` or `false`, or it can be omitted. +Overrides the value of the global `caseInsensitive` option for this rule, if that option is defined. + +```g4 +options { caseInsensitive=true; } +STRING: 'N'? '\'' (~'\'' | '\'\'')* '\'' -> caseInsensitive(false); // lowercase n is not allowed +``` + +If the argument is omitted, `true` is used: + +```g4 +CASE_INSENSITIVE_STRING: [a-z]+ -> caseInsensitive; // it matches ASDF +``` + diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java index 81d1bc98ecd..bfaa6b0cd08 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java @@ -505,6 +505,27 @@ public void testSetUp() throws Exception { checkLexerMatches(lg, inputString, "TOKEN, EOF"); } + @Test public void testCaseInsensitiveInLexerRule() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n" + + "TOKEN1: [a-f]+ -> caseInsensitive(true);\n" + + "TOKEN2: [g-l]+ -> caseInsensitive;\n" + + "WS: [ ]+ -> skip;" + ); + + checkLexerMatches(lg, "ABCDEF GHIJKL", "TOKEN1, WS, TOKEN2, EOF"); + } + + @Test public void testCaseInsensitiveInLexerRuleOverridesGlobalValue() throws Exception { + String grammar = + "lexer grammar L;\n" + + "options { caseInsensitive = true; }\n" + + "STRING: 'N'? 
'\\'' (~'\\'' | '\\'\\'')* '\\'' -> caseInsensitive(false);\n"; + + execLexer("L.g4", grammar, "L", "n'sample'"); + assertEquals("line 1:0 token recognition error at: 'n'\n", getParseErrors()); + } + protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) { ATN atn = createATN(lg, true); CharStream input = CharStreams.fromString(inputString); diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java index 14ae5eff61e..2846985271b 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java @@ -239,12 +239,16 @@ public void testSetUp() throws Exception { "mode MODE2;\n" + "MODE2_TOKEN: 'f';\n" + "MODE2_TOKEN1: 'g' -> type(TEST1), type(TEST2);\n" + - "MODE2_TOKEN2: 'h' -> channel(CHANNEL1), channel(CHANNEL2), channel(DEFAULT_TOKEN_CHANNEL);", + "MODE2_TOKEN2: 'h' -> channel(CHANNEL1), channel(CHANNEL2), channel(DEFAULT_TOKEN_CHANNEL);\n" + + "CASE_INSENSITIVE1: 'c1' -> caseInsensitive(true), caseInsensitive(true);\n" + + "CASE_INSENSITIVE2: 'c2' -> caseInsensitive, caseInsensitive;\n", "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:4:27: duplicated command mode\n" + "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:12:34: duplicated command type\n" + "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:40: duplicated command channel\n" + - "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:59: duplicated command channel\n" + "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:59: duplicated command channel\n" + + "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:14:50: duplicated command caseInsensitive\n" + + "warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:15:44: duplicated command caseInsensitive\n" }; testErrors(test, false); @@ -472,18 +476,39 @@ public void 
testLabelsForTokensWithMixedTypesLRWithoutLabels() { testErrors(test, false); } - @Test public void testIllegalModeOption() { + @Test public void testIllegalCaseInsensitiveOptionValue() { String[] test = { "lexer grammar L;\n" + "options { caseInsensitive = badValue; }\n" + - "DEFAULT_TOKEN: [A-F]+;\n", + "TOKEN_1: [A-F]+ -> caseInsensitive(badValue);\n", - "warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:2:28: unsupported option value caseInsensitive=badValue\n" + "warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:2:28: unsupported option value caseInsensitive=badValue\n" + + "warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:3:35: unsupported option value caseInsensitive=badValue\n" }; testErrors(test, false); } + @Test public void testRedundantCaseInsensitiveLexerCommand() { + String[] test = { + "lexer grammar L;\n" + + "options { caseInsensitive = true; }\n" + + "TOKEN: [A-F]+ -> caseInsensitive;\n", + + "warning(" + ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND.code + "): L.g4:3:17: caseInsensitive lexer command is redundant because its value equals to global value (true)\n" + }; + testErrors(test, false); + + String[] test2 = { + "lexer grammar L;\n" + + "options { caseInsensitive = false; }\n" + + "TOKEN: [A-F]+ -> caseInsensitive(false);\n", + + "warning(" + ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND.code + "): L.g4:3:17: caseInsensitive lexer command is redundant because its value equals to global value (false)\n" + }; + testErrors(test2, false); + } + @Test public void testNotImpliedCharacters() { String[] test = { "lexer grammar Test;\n" + diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java index defeec53144..6091a7b3605 100644 --- a/tool/src/org/antlr/v4/automata/LexerATNFactory.java +++ b/tool/src/org/antlr/v4/automata/LexerATNFactory.java @@ -207,51 +207,46 @@ public Handle lexerAltCommands(Handle alt, Handle cmds) { @Override public Handle 
lexerCallCommand(GrammarAST ID, GrammarAST arg) { - LexerAction lexerAction = createLexerAction(ID, arg); - if (lexerAction != null) { - return action(ID, lexerAction); - } - - // fall back to standard action generation for the command - ST cmdST = codegenTemplates.getInstanceOf("Lexer" + - CharSupport.capitalize(ID.getText())+ - "Command"); - if (cmdST == null) { - g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText()); - return epsilon(ID); - } - - if (cmdST.impl.formalArguments == null || !cmdST.impl.formalArguments.containsKey("arg")) { - g.tool.errMgr.grammarError(ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText()); - return epsilon(ID); - } - - cmdST.add("arg", arg.getText()); - cmdST.add("grammar", arg.g); - return action(cmdST.render()); + return lexerCallCommandOrCommand(ID, arg); } @Override public Handle lexerCommand(GrammarAST ID) { - LexerAction lexerAction = createLexerAction(ID, null); + return lexerCallCommandOrCommand(ID, null); + } + + private Handle lexerCallCommandOrCommand(GrammarAST ID, GrammarAST arg) { + LexerAction lexerAction = createLexerAction(ID, arg); if (lexerAction != null) { return action(ID, lexerAction); } + if (Rule.caseInsensitiveLexerCommandKey.equals(ID.getText())) { + return epsilon(ID); + } + // fall back to standard action generation for the command ST cmdST = codegenTemplates.getInstanceOf("Lexer" + - CharSupport.capitalize(ID.getText()) + + CharSupport.capitalize(ID.getText())+ "Command"); if (cmdST == null) { g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText()); return epsilon(ID); } - if (cmdST.impl.formalArguments != null && cmdST.impl.formalArguments.containsKey("arg")) { - g.tool.errMgr.grammarError(ErrorType.MISSING_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText()); + boolean callCommand = arg != null; + boolean containsArg = cmdST.impl.formalArguments != null && 
cmdST.impl.formalArguments.containsKey("arg"); + if (callCommand != containsArg) { + ErrorType errorType = callCommand ? ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT : ErrorType.MISSING_LEXER_COMMAND_ARGUMENT; + g.tool.errMgr.grammarError(errorType, g.fileName, ID.token, ID.getText()); return epsilon(ID); } + if (callCommand) { + cmdST.add("arg", arg.getText()); + cmdST.add("grammar", arg.g); + } + return action(cmdST.render()); } @@ -279,7 +274,7 @@ public Handle set(GrammarAST associatedAST, List alts, boolean inver int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText()); int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText()); if (checkRange((GrammarAST)t.getChild(0), (GrammarAST)t.getChild(1), a, b)) { - checkRangeAndAddToSet(associatedAST, t, set, a, b, caseInsensitive, null); + checkRangeAndAddToSet(associatedAST, t, set, a, b, getCurrentCaseInsensitivity(), null); } } else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) { @@ -567,11 +562,11 @@ private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParse } private void checkCharAndAddToSet(GrammarAST ast, IntervalSet set, int c) { - checkRangeAndAddToSet(ast, ast, set, c, c, caseInsensitive, null); + checkRangeAndAddToSet(ast, ast, set, c, c, getCurrentCaseInsensitivity(), null); } private void checkRangeAndAddToSet(GrammarAST mainAst, IntervalSet set, int a, int b) { - checkRangeAndAddToSet(mainAst, mainAst, set, a, b, caseInsensitive, null); + checkRangeAndAddToSet(mainAst, mainAst, set, a, b, getCurrentCaseInsensitivity(), null); } private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, GrammarAST ast, IntervalSet set, int a, int b, boolean caseInsensitive, CharactersDataCheckStatus previousStatus) { @@ -630,7 +625,7 @@ private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, Gram private Transition createTransition(ATNState target, int from, int to, CommonTree tree) { RangeBorderCharactersData 
charactersData = RangeBorderCharactersData.getAndCheckCharactersData(from, to, g, tree, true); - if (caseInsensitive) { + if (getCurrentCaseInsensitivity()) { if (charactersData.isSingleRange()) { return CodePointTransitions.createWithCodePointRange(target, from, to); } @@ -646,6 +641,10 @@ private Transition createTransition(ATNState target, int from, int to, CommonTre } } + private boolean getCurrentCaseInsensitivity() { + return currentRule.caseInsensitive != null ? currentRule.caseInsensitive : caseInsensitive; + } + @Override public Handle tokenRef(TerminalAST node) { // Ref to EOF in lexer yields char transition on -1 diff --git a/tool/src/org/antlr/v4/semantics/SemanticPipeline.java b/tool/src/org/antlr/v4/semantics/SemanticPipeline.java index 06537f95225..571bc2beed7 100644 --- a/tool/src/org/antlr/v4/semantics/SemanticPipeline.java +++ b/tool/src/org/antlr/v4/semantics/SemanticPipeline.java @@ -6,6 +6,8 @@ package org.antlr.v4.semantics; +import org.antlr.runtime.tree.CommonTree; +import org.antlr.runtime.tree.Tree; import org.antlr.v4.analysis.LeftRecursiveRuleTransformer; import org.antlr.v4.automata.LexerATNFactory; import org.antlr.v4.parse.ANTLRParser; @@ -102,7 +104,7 @@ public void process() { // ASSIGN TOKEN TYPES g.importTokensFromTokensFile(); if ( g.isLexer() ) { - assignLexerTokenTypes(g, collector.tokensDefs); + assignLexerTokenTypesAndCaseInsensitivity(g, collector.tokensDefs); } else { assignTokenTypes(g, collector.tokensDefs, @@ -136,7 +138,7 @@ void identifyStartRules(SymbolCollector collector) { } } - void assignLexerTokenTypes(Grammar g, List tokensDefs) { + void assignLexerTokenTypesAndCaseInsensitivity(Grammar g, List tokensDefs) { Grammar G = g.getOutermostGrammar(); // put in root, even if imported for (GrammarAST def : tokensDefs) { // tokens { id (',' id)* } so must check IDs not TOKEN_REF @@ -152,6 +154,7 @@ void assignLexerTokenTypes(Grammar g, List tokensDefs) { if ( !r.isFragment() && !hasTypeOrMoreCommand(r) ) { 
G.defineTokenName(r.name); } + extractRuleCaseInsensitivity(r); } // FOR ALL X : 'xxx'; RULES, DEFINE 'xxx' AS TYPE X @@ -210,6 +213,63 @@ else if (LexerMoreAction.key.equals(node.getText())) { return false; } + void extractRuleCaseInsensitivity(Rule r) { + GrammarAST ast = r.ast; + if (ast == null) { + return; + } + + GrammarAST altActionAst = (GrammarAST) ast.getFirstDescendantWithType(ANTLRParser.LEXER_ALT_ACTION); + if (altActionAst == null) { + return; + } + + String caseInsensitiveOption = g.getOptionString("caseInsensitive"); + boolean defaultCaseInsensitive = caseInsensitiveOption != null && caseInsensitiveOption.equals("true"); + + for (int i = 1; i < altActionAst.getChildCount(); i++) { + GrammarAST node = (GrammarAST) altActionAst.getChild(i); + Tree commandKeyNode; + String commandKey; + Tree commandValueNode; + String commandValue; + if (node.getType() == ANTLRParser.LEXER_ACTION_CALL) { + commandKeyNode = node.getChild(0); + commandValueNode = node.getChild(1); + commandValue = commandValueNode.getText(); + } + else { + commandKeyNode = node; + commandValueNode = null; + commandValue = "true"; + } + commandKey = commandKeyNode.getText(); + + if (Rule.caseInsensitiveLexerCommandKey.equals(commandKey)) { + Boolean currentCaseInsensitivity = null; + if (commandValue != null) { + if (commandValue.equals("true")) { + currentCaseInsensitivity = true; + } + else if (commandValue.equals("false")) { + currentCaseInsensitivity = false; + } + } + if (currentCaseInsensitivity == null) { + g.tool.errMgr.grammarError(ErrorType.ILLEGAL_OPTION_VALUE, g.fileName, + ((GrammarAST) commandValueNode).getToken(), commandKey, commandValue); + } + else { + if (currentCaseInsensitivity == defaultCaseInsensitive) { + g.tool.errMgr.grammarError(ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND, + g.fileName, ((CommonTree)commandKeyNode).getToken(), defaultCaseInsensitive); + } + r.caseInsensitive = currentCaseInsensitivity; + } + } + } + } + void assignTokenTypes(Grammar g, 
List tokensDefs, List tokenIDs, List terminals) { diff --git a/tool/src/org/antlr/v4/tool/ErrorType.java b/tool/src/org/antlr/v4/tool/ErrorType.java index a0f374b3338..dfb7a325ae0 100644 --- a/tool/src/org/antlr/v4/tool/ErrorType.java +++ b/tool/src/org/antlr/v4/tool/ErrorType.java @@ -1107,6 +1107,19 @@ public enum ErrorType { "Range .. probably contains not implied characters . Both bounds should be defined in lower or UPPER case", ErrorSeverity.WARNING ), + /** + *

Redundant caseInsensitive lexer command

+ * + *
+	 * options { caseInsensitive=true; }
+	 * TOKEN: [a-z]+ -> caseInsensitive(true); // warning
+	 * 
+ */ + REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND( + 186, + "caseInsensitive lexer command is redundant because its value equals to global value ()", + ErrorSeverity.WARNING + ), /* * Backward incompatibility errors diff --git a/tool/src/org/antlr/v4/tool/Rule.java b/tool/src/org/antlr/v4/tool/Rule.java index 222651a3361..b80f5fbcebd 100644 --- a/tool/src/org/antlr/v4/tool/Rule.java +++ b/tool/src/org/antlr/v4/tool/Rule.java @@ -38,6 +38,8 @@ public class Rule implements AttributeResolver { predefinedRulePropertiesDict.add(new Attribute("ctx")); } + public static final String caseInsensitiveLexerCommandKey = "caseInsensitive"; + public String name; public List modifiers; @@ -52,6 +54,9 @@ public class Rule implements AttributeResolver { /** If we're in a lexer grammar, we might be in a mode */ public String mode; + /** If null then use value from global option that is false by default */ + public Boolean caseInsensitive = null; + /** Map a name to an action for this rule like @init {...}. * The code generator will use this to fill holes in the rule template. * I track the AST node for the action in case I need the line number diff --git a/tool/src/org/antlr/v4/tool/ast/GrammarAST.java b/tool/src/org/antlr/v4/tool/ast/GrammarAST.java index adad39568c6..39cd23f42e1 100644 --- a/tool/src/org/antlr/v4/tool/ast/GrammarAST.java +++ b/tool/src/org/antlr/v4/tool/ast/GrammarAST.java @@ -80,7 +80,12 @@ public List getNodesWithType(IntervalSet types) { t = work.remove(0); if ( types==null || types.contains(t.getType()) ) nodes.add(t); if ( t.children!=null ) { - work.addAll(Arrays.asList(t.getChildrenAsArray())); + GrammarAST[] childrenArray = t.getChildrenAsArray(); + for (GrammarAST child : childrenArray) { + if (child.getType() != ANTLRParser.LEXER_ACTION_CALL) { + work.add(child); + } + } } } return nodes;