Introduce caseInsensitive lexer command, fixes #3436

Refactor LexerATNFactory
antlr · Dec 26, 2021 · 214a45b · 214a45b
1 parent b8ab312
commit 214a45b
Show file tree

Hide file tree

Showing 8 changed files with 185 additions and 38 deletions.
diff --git a/doc/lexer-rules.md b/doc/lexer-rules.md
@@ -225,6 +225,7 @@ An alternative can have more than one command separated by commas. Here are the
 * pushMode( x )
 * type( x )
 * channel( x )
+* caseInsensitive( x )
 
 See the book source code for usage, some examples of which are shown here:
 
@@ -308,3 +309,21 @@ As of 4.5, you can also define channel names like enumerations with the followin
 ```
 channels { WSCHANNEL, MYHIDDEN }
 ```
+
+### caseInsensitive()
+
+Specifies whether the token rule is matched case-insensitively.
+The argument can be `true` or `false`, or it can be omitted.
+Overrides the value of the grammar-level `caseInsensitive` option for this rule, if that option is set.
+
+```g4
+options { caseInsensitive=true; }
+STRING: 'N'? '\'' (~'\'' | '\'\'')* '\'' -> caseInsensitive(false); // lower n is not allowed
+```
+
+If the argument is omitted, `true` is used:
+
+```g4
+CASE_INSENSITIVE_STRING: [a-z]+ -> caseInsensitive; // it matches ASDF
+```
+
diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNLexerInterpreter.java
@@ -505,6 +505,27 @@ public void testSetUp() throws Exception {
 		checkLexerMatches(lg, inputString, "TOKEN, EOF");
 	}
 
+	/** Checks that both {@code caseInsensitive(true)} and the bare {@code caseInsensitive} command let lower-case sets match upper-case input. */
+	@Test public void testCaseInsensitiveInLexerRule() throws Exception {
+		String grammarText =
+				"lexer grammar L;\n" +
+				"TOKEN1: [a-f]+ -> caseInsensitive(true);\n" +
+				"TOKEN2: [g-l]+ -> caseInsensitive;\n" +
+				"WS: [ ]+ -> skip;";
+		String upperCaseInput = "ABCDEF GHIJKL";
+		String expectedTokens = "TOKEN1, WS, TOKEN2, EOF";
+
+		LexerGrammar grammar = new LexerGrammar(grammarText);
+		checkLexerMatches(grammar, upperCaseInput, expectedTokens);
+	}
+
+	@Test public void testCaseInsensitiveInLexerRuleOverridesGlobalValue() throws Exception {
+		String grammar =
+				"lexer grammar L;\n" +
+				"options { caseInsensitive = true; }\n" +
+				"STRING: 'N'? '\\'' (~'\\'' | '\\'\\'')* '\\'' -> caseInsensitive(false);\n";
+
+		execLexer("L.g4", grammar, "L", "n'sample'");
+		assertEquals("line 1:0 token recognition error at: 'n'\n", getParseErrors());
+	}
+
 	protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
 		ATN atn = createATN(lg, true);
 		CharStream input = CharStreams.fromString(inputString);

diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java
@@ -239,12 +239,16 @@ public void testSetUp() throws Exception {
 			"mode MODE2;\n" +
 			"MODE2_TOKEN: 'f';\n" +
 			"MODE2_TOKEN1: 'g' -> type(TEST1), type(TEST2);\n" +
-			"MODE2_TOKEN2: 'h' -> channel(CHANNEL1), channel(CHANNEL2), channel(DEFAULT_TOKEN_CHANNEL);",
+			"MODE2_TOKEN2: 'h' -> channel(CHANNEL1), channel(CHANNEL2), channel(DEFAULT_TOKEN_CHANNEL);\n" +
+			"CASE_INSENSITIVE1: 'c1' -> caseInsensitive(true), caseInsensitive(true);\n" +
+			"CASE_INSENSITIVE2: 'c2' -> caseInsensitive, caseInsensitive;\n",
 
 			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:4:27: duplicated command mode\n" +
 			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:12:34: duplicated command type\n" +
 			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:40: duplicated command channel\n" +
-			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:59: duplicated command channel\n"
+			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:59: duplicated command channel\n" +
+			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:14:50: duplicated command caseInsensitive\n" +
+			"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:15:44: duplicated command caseInsensitive\n"
 		};
 
 		testErrors(test, false);
@@ -472,18 +476,39 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {
 		testErrors(test, false);
 	}
 
-	@Test public void testIllegalModeOption() {
+	@Test public void testIllegalCaseInsensitiveOptionValue() {
 		String[] test = {
 				"lexer grammar L;\n" +
 				"options { caseInsensitive = badValue; }\n" +
-				"DEFAULT_TOKEN: [A-F]+;\n",
+				"TOKEN_1: [A-F]+ -> caseInsensitive(badValue);\n",
 
-				"warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:2:28: unsupported option value caseInsensitive=badValue\n"
+				"warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:2:28: unsupported option value caseInsensitive=badValue\n" +
+				"warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:3:35: unsupported option value caseInsensitive=badValue\n"
 		};
 
 		testErrors(test, false);
 	}
 
+	/** A rule-level command that repeats the grammar-level value (true or false) must produce the redundancy warning. */
+	@Test public void testRedundantCaseInsensitiveLexerCommand() {
+		int warningCode = ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND.code;
+
+		// Global option is true and the bare command also means true.
+		String[] redundantTrue = {
+				"lexer grammar L;\n" +
+				"options { caseInsensitive = true; }\n" +
+				"TOKEN: [A-F]+ -> caseInsensitive;\n",
+
+				"warning(" + warningCode + "): L.g4:3:17: caseInsensitive lexer command is redundant because its value equals to global value (true)\n"
+		};
+		testErrors(redundantTrue, false);
+
+		// Global option is false and the command explicitly repeats false.
+		String[] redundantFalse = {
+				"lexer grammar L;\n" +
+				"options { caseInsensitive = false; }\n" +
+				"TOKEN: [A-F]+ -> caseInsensitive(false);\n",
+
+				"warning(" + warningCode + "): L.g4:3:17: caseInsensitive lexer command is redundant because its value equals to global value (false)\n"
+		};
+		testErrors(redundantFalse, false);
+	}
+
 	@Test public void testNotImpliedCharacters() {
 		String[] test = {
 				"lexer grammar Test;\n" +

diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@@ -207,51 +207,46 @@ public Handle lexerAltCommands(Handle alt, Handle cmds) {
 
 	@Override
 	public Handle lexerCallCommand(GrammarAST ID, GrammarAST arg) {
-		LexerAction lexerAction = createLexerAction(ID, arg);
-		if (lexerAction != null) {
-			return action(ID, lexerAction);
-		}
-
-		// fall back to standard action generation for the command
-		ST cmdST = codegenTemplates.getInstanceOf("Lexer" +
-												  CharSupport.capitalize(ID.getText())+
-												  "Command");
-		if (cmdST == null) {
-			g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText());
-			return epsilon(ID);
-		}
-
-		if (cmdST.impl.formalArguments == null || !cmdST.impl.formalArguments.containsKey("arg")) {
-			g.tool.errMgr.grammarError(ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText());
-			return epsilon(ID);
-		}
-
-		cmdST.add("arg", arg.getText());
-		cmdST.add("grammar", arg.g);
-		return action(cmdST.render());
+		return lexerCallCommandOrCommand(ID, arg);
 	}
 
 	@Override
 	public Handle lexerCommand(GrammarAST ID) {
-		LexerAction lexerAction = createLexerAction(ID, null);
+		return lexerCallCommandOrCommand(ID, null);
+	}
+
+	private Handle lexerCallCommandOrCommand(GrammarAST ID, GrammarAST arg) { // shared body of lexerCallCommand (arg given) and lexerCommand (arg == null)
+		LexerAction lexerAction = createLexerAction(ID, arg);
 		if (lexerAction != null) {
 			return action(ID, lexerAction);
 		}
 
+		if (Rule.caseInsensitiveLexerCommandKey.equals(ID.getText())) { // no LexerAction exists for caseInsensitive; skip the template lookup below
+			return epsilon(ID); // the command contributes only an epsilon handle here
+		}
+
 		// fall back to standard action generation for the command
 		ST cmdST = codegenTemplates.getInstanceOf("Lexer" +
-				CharSupport.capitalize(ID.getText()) +
+				CharSupport.capitalize(ID.getText())+
 				"Command");
 		if (cmdST == null) {
 			g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText());
 			return epsilon(ID);
 		}
 
-		if (cmdST.impl.formalArguments != null && cmdST.impl.formalArguments.containsKey("arg")) {
-			g.tool.errMgr.grammarError(ErrorType.MISSING_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText());
+		boolean callCommand = arg != null; // true when the command was written with an argument
+		boolean containsArg = cmdST.impl.formalArguments != null && cmdST.impl.formalArguments.containsKey("arg"); // true when the template declares an "arg" parameter
+		if (callCommand != containsArg) { // argument supplied but not accepted, or accepted but not supplied
+			ErrorType errorType = callCommand ? ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT : ErrorType.MISSING_LEXER_COMMAND_ARGUMENT;
+			g.tool.errMgr.grammarError(errorType, g.fileName, ID.token, ID.getText());
 			return epsilon(ID);
 		}
 
+		if (callCommand) { // only a call-style command has values to bind into the template
+			cmdST.add("arg", arg.getText());
+			cmdST.add("grammar", arg.g);
+		}
+
 		return action(cmdST.render());
 	}
 
@@ -279,7 +274,7 @@ public Handle set(GrammarAST associatedAST, List<GrammarAST> alts, boolean inver
 				int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText());
 				int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText());
 				if (checkRange((GrammarAST)t.getChild(0), (GrammarAST)t.getChild(1), a, b)) {
-					checkRangeAndAddToSet(associatedAST, t, set, a, b, caseInsensitive, null);
+					checkRangeAndAddToSet(associatedAST, t, set, a, b, getCurrentCaseInsensitivity(), null);
 				}
 			}
 			else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) {
@@ -567,11 +562,11 @@ private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParse
 	}
 
 	private void checkCharAndAddToSet(GrammarAST ast, IntervalSet set, int c) {
-		checkRangeAndAddToSet(ast, ast, set, c, c, caseInsensitive, null);
+		checkRangeAndAddToSet(ast, ast, set, c, c, getCurrentCaseInsensitivity(), null);
 	}
 
 	private void checkRangeAndAddToSet(GrammarAST mainAst, IntervalSet set, int a, int b) {
-		checkRangeAndAddToSet(mainAst, mainAst, set, a, b, caseInsensitive, null);
+		checkRangeAndAddToSet(mainAst, mainAst, set, a, b, getCurrentCaseInsensitivity(), null);
 	}
 
 	private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, GrammarAST ast, IntervalSet set, int a, int b, boolean caseInsensitive, CharactersDataCheckStatus previousStatus) {
@@ -630,7 +625,7 @@ private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, Gram
 
 	private Transition createTransition(ATNState target, int from, int to, CommonTree tree) {
 		RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(from, to, g, tree, true);
-		if (caseInsensitive) {
+		if (getCurrentCaseInsensitivity()) {
 			if (charactersData.isSingleRange()) {
 				return CodePointTransitions.createWithCodePointRange(target, from, to);
 			}
@@ -646,6 +641,10 @@ private Transition createTransition(ATNState target, int from, int to, CommonTre
 		}
 	}
 
+	/** Returns the case-insensitivity set on {@code currentRule} if it is non-null, otherwise this factory's {@code caseInsensitive} flag. */
+	private boolean getCurrentCaseInsensitivity() {
+		Boolean ruleOverride = currentRule.caseInsensitive;
+		if (ruleOverride != null) {
+			return ruleOverride;
+		}
+		return caseInsensitive;
+	}
+
 	@Override
 	public Handle tokenRef(TerminalAST node) {
 		// Ref to EOF in lexer yields char transition on -1

diff --git a/tool/src/org/antlr/v4/semantics/SemanticPipeline.java b/tool/src/org/antlr/v4/semantics/SemanticPipeline.java
@@ -6,6 +6,8 @@
 
 package org.antlr.v4.semantics;
 
+import org.antlr.runtime.tree.CommonTree;
+import org.antlr.runtime.tree.Tree;
 import org.antlr.v4.analysis.LeftRecursiveRuleTransformer;
 import org.antlr.v4.automata.LexerATNFactory;
 import org.antlr.v4.parse.ANTLRParser;
@@ -102,7 +104,7 @@ public void process() {
 		// ASSIGN TOKEN TYPES
 		g.importTokensFromTokensFile();
 		if ( g.isLexer() ) {
-			assignLexerTokenTypes(g, collector.tokensDefs);
+			assignLexerTokenTypesAndCaseInsensitivity(g, collector.tokensDefs);
 		}
 		else {
 			assignTokenTypes(g, collector.tokensDefs,
@@ -136,7 +138,7 @@ void identifyStartRules(SymbolCollector collector) {
 		}
 	}
 
-	void assignLexerTokenTypes(Grammar g, List<GrammarAST> tokensDefs) {
+	void assignLexerTokenTypesAndCaseInsensitivity(Grammar g, List<GrammarAST> tokensDefs) {
 		Grammar G = g.getOutermostGrammar(); // put in root, even if imported
 		for (GrammarAST def : tokensDefs) {
 			// tokens { id (',' id)* } so must check IDs not TOKEN_REF
@@ -152,6 +154,7 @@ void assignLexerTokenTypes(Grammar g, List<GrammarAST> tokensDefs) {
 			if ( !r.isFragment() && !hasTypeOrMoreCommand(r) ) {
 				G.defineTokenName(r.name);
 			}
+			extractRuleCaseInsensitivity(r);
 		}
 
 		// FOR ALL X : 'xxx'; RULES, DEFINE 'xxx' AS TYPE X
@@ -210,6 +213,63 @@ else if (LexerMoreAction.key.equals(node.getText())) {
 		return false;
 	}
 
+	/**
+	 * Scans the commands of the first {@code LEXER_ALT_ACTION} found under the rule's AST
+	 * for {@code caseInsensitive} and stores the parsed value in {@code r.caseInsensitive}.
+	 *
+	 * <p>A bare {@code caseInsensitive} command is treated as {@code caseInsensitive(true)}.
+	 * A value other than {@code true}/{@code false} is reported as
+	 * {@link ErrorType#ILLEGAL_OPTION_VALUE} and leaves the rule untouched. A value equal to
+	 * the grammar-level {@code caseInsensitive} option is reported as
+	 * {@link ErrorType#REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND} but is still stored.</p>
+	 *
+	 * @param r the lexer rule to inspect; nothing happens if it has no AST or no commands
+	 */
+	void extractRuleCaseInsensitivity(Rule r) {
+		GrammarAST ruleAst = r.ast;
+		GrammarAST commandsAst = ruleAst == null
+				? null
+				: (GrammarAST) ruleAst.getFirstDescendantWithType(ANTLRParser.LEXER_ALT_ACTION);
+		if (commandsAst == null) {
+			return;
+		}
+
+		// An absent option counts as false.
+		boolean globalValue = "true".equals(g.getOptionString("caseInsensitive"));
+
+		// Child 0 is skipped; commands are read from index 1 onwards.
+		for (int idx = 1; idx < commandsAst.getChildCount(); idx++) {
+			GrammarAST command = (GrammarAST) commandsAst.getChild(idx);
+			boolean isCall = command.getType() == ANTLRParser.LEXER_ACTION_CALL;
+
+			Tree keyNode = isCall ? command.getChild(0) : command;
+			Tree valueNode = isCall ? command.getChild(1) : null;
+			String value = isCall ? valueNode.getText() : "true";
+			String key = keyNode.getText();
+
+			if (!Rule.caseInsensitiveLexerCommandKey.equals(key)) {
+				continue;
+			}
+
+			Boolean parsed;
+			if ("true".equals(value)) {
+				parsed = true;
+			}
+			else if ("false".equals(value)) {
+				parsed = false;
+			}
+			else {
+				parsed = null;
+			}
+
+			if (parsed == null) {
+				// Only a call-style command can reach this point, so valueNode is set.
+				g.tool.errMgr.grammarError(ErrorType.ILLEGAL_OPTION_VALUE, g.fileName,
+						((GrammarAST) valueNode).getToken(), key, value);
+				continue;
+			}
+
+			if (parsed == globalValue) {
+				g.tool.errMgr.grammarError(ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND,
+						g.fileName, ((CommonTree) keyNode).getToken(), globalValue);
+			}
+			r.caseInsensitive = parsed;
+		}
+	}
+
 	void assignTokenTypes(Grammar g, List<GrammarAST> tokensDefs,
 						  List<GrammarAST> tokenIDs, List<GrammarAST> terminals)
 	{

diff --git a/tool/src/org/antlr/v4/tool/ErrorType.java b/tool/src/org/antlr/v4/tool/ErrorType.java
@@ -1107,6 +1107,19 @@ public enum ErrorType {
 			"Range <arg>..<arg2> probably contains not implied characters <arg3>. Both bounds should be defined in lower or UPPER case",
 			ErrorSeverity.WARNING
 	),
+	/**
+	 * <p>Redundant caseInsensitive lexer command</p>
+	 *
+	 * <pre>
+	 * options { caseInsensitive=true; }
+	 * TOKEN: [a-z]+ -> caseInsensitive(true); // warning
+	 * </pre>
+	 */
+	REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND(
+			186,
+			"caseInsensitive lexer command is redundant because its value equals to global value (<arg>)",
+			ErrorSeverity.WARNING
+	),
 
 	/*
 	 * Backward incompatibility errors

diff --git a/tool/src/org/antlr/v4/tool/Rule.java b/tool/src/org/antlr/v4/tool/Rule.java
@@ -38,6 +38,8 @@ public class Rule implements AttributeResolver {
 		predefinedRulePropertiesDict.add(new Attribute("ctx"));
 	}
 
+	public static final String caseInsensitiveLexerCommandKey = "caseInsensitive";
+
 	public String name;
 	public List<GrammarAST> modifiers;
 
@@ -52,6 +54,9 @@ public class Rule implements AttributeResolver {
 	/** If we're in a lexer grammar, we might be in a mode */
 	public String mode;
 
+	/** Rule-level case-insensitivity override; if null, the value of the global {@code caseInsensitive} option is used (false by default). */
+	public Boolean caseInsensitive = null;
+
     /** Map a name to an action for this rule like @init {...}.
      *  The code generator will use this to fill holes in the rule template.
      *  I track the AST node for the action in case I need the line number

diff --git a/tool/src/org/antlr/v4/tool/ast/GrammarAST.java b/tool/src/org/antlr/v4/tool/ast/GrammarAST.java
@@ -80,7 +80,12 @@ public List<GrammarAST> getNodesWithType(IntervalSet types) {
 			t = work.remove(0);
 			if ( types==null || types.contains(t.getType()) ) nodes.add(t);
 			if ( t.children!=null ) {
-				work.addAll(Arrays.asList(t.getChildrenAsArray()));
+				GrammarAST[] childrenArray = t.getChildrenAsArray();
+				for (GrammarAST child : childrenArray) {
+					if (child.getType() != ANTLRParser.LEXER_ACTION_CALL) {
+						work.add(child);
+					}
+				}
 			}
 		}
 		return nodes;