Skip to content

Commit

Permalink
Introduce caseInsensitive lexer command, fixes #3436
Browse files Browse the repository at this point in the history
Refactor LexerATNFactory
  • Loading branch information
KvanTTT committed Dec 26, 2021
1 parent b8ab312 commit 214a45b
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 38 deletions.
19 changes: 19 additions & 0 deletions doc/lexer-rules.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ An alternative can have more than one command separated by commas. Here are the
* pushMode( x )
* type( x )
* channel( x )
* caseInsensitive( x )
See the book source code for usage, some examples of which are shown here:
Expand Down Expand Up @@ -308,3 +309,21 @@ As of 4.5, you can also define channel names like enumerations with the followin
```
channels { WSCHANNEL, MYHIDDEN }
```
### caseInsensitive()
Specifies whether the token rule is matched case-insensitively.
The argument can be `true` or `false`, or it can be omitted.
If the grammar-level `caseInsensitive` option is set, this command overrides it for the rule.
```g4
options { caseInsensitive=true; }
STRING: 'N'? '\'' (~'\'' | '\'\'')* '\'' -> caseInsensitive(false); // lower n is not allowed
```
If the argument is omitted, `true` is used:
```g4
CASE_INSENSITIVE_STRING: [a-z]+ -> caseInsensitive; // it matches ASDF
```
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,27 @@ public void testSetUp() throws Exception {
checkLexerMatches(lg, inputString, "TOKEN, EOF");
}

/**
 * Upper-case input must be tokenized by lower-case character sets when the
 * rule carries the {@code caseInsensitive} lexer command, both with an
 * explicit {@code true} argument and with no argument at all.
 */
@Test public void testCaseInsensitiveInLexerRule() throws Exception {
    // TOKEN1 passes the argument explicitly; TOKEN2 uses the argument-less form.
    String grammarText =
        "lexer grammar L;\n" +
        "TOKEN1: [a-f]+ -> caseInsensitive(true);\n" +
        "TOKEN2: [g-l]+ -> caseInsensitive;\n" +
        "WS: [ ]+ -> skip;";
    LexerGrammar lexerGrammar = new LexerGrammar(grammarText);

    String upperCaseInput = "ABCDEF GHIJKL";
    String expectedTokens = "TOKEN1, WS, TOKEN2, EOF";
    checkLexerMatches(lexerGrammar, upperCaseInput, expectedTokens);
}

/**
 * A rule-level {@code caseInsensitive(false)} must win over the grammar-level
 * {@code caseInsensitive = true} option: the lower-case prefix {@code n} is
 * then rejected by the {@code 'N'?} element of the STRING rule.
 */
@Test public void testCaseInsensitiveInLexerRuleOverridesGlobalValue() throws Exception {
    final String lowerCasePrefixInput = "n'sample'";
    final String expectedErrors = "line 1:0 token recognition error at: 'n'\n";

    String grammar =
        "lexer grammar L;\n" +
        "options { caseInsensitive = true; }\n" +
        "STRING: 'N'? '\\'' (~'\\'' | '\\'\\'')* '\\'' -> caseInsensitive(false);\n";

    execLexer("L.g4", grammar, "L", lowerCasePrefixInput);
    assertEquals(expectedErrors, getParseErrors());
}

protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg, true);
CharStream input = CharStreams.fromString(inputString);
Expand Down
35 changes: 30 additions & 5 deletions tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,16 @@ public void testSetUp() throws Exception {
"mode MODE2;\n" +
"MODE2_TOKEN: 'f';\n" +
"MODE2_TOKEN1: 'g' -> type(TEST1), type(TEST2);\n" +
"MODE2_TOKEN2: 'h' -> channel(CHANNEL1), channel(CHANNEL2), channel(DEFAULT_TOKEN_CHANNEL);",
"MODE2_TOKEN2: 'h' -> channel(CHANNEL1), channel(CHANNEL2), channel(DEFAULT_TOKEN_CHANNEL);\n" +
"CASE_INSENSITIVE1: 'c1' -> caseInsensitive(true), caseInsensitive(true);\n" +
"CASE_INSENSITIVE2: 'c2' -> caseInsensitive, caseInsensitive;\n",

"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:4:27: duplicated command mode\n" +
"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:12:34: duplicated command type\n" +
"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:40: duplicated command channel\n" +
"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:59: duplicated command channel\n"
"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:13:59: duplicated command channel\n" +
"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:14:50: duplicated command caseInsensitive\n" +
"warning(" + ErrorType.DUPLICATED_COMMAND.code + "): Lexer.g4:15:44: duplicated command caseInsensitive\n"
};

testErrors(test, false);
Expand Down Expand Up @@ -472,18 +476,39 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {
testErrors(test, false);
}

// Feeds a lexer grammar that uses the unsupported value "badValue" for caseInsensitive,
// both as a grammar-level option and as a lexer command argument, and passes the expected
// ILLEGAL_OPTION_VALUE warning text to testErrors.
// NOTE(review): this span appears to contain both the pre-change and post-change versions of
// several lines (two method headers, two token rules, two copies of the first warning) — confirm
// against the real file before relying on it.
@Test public void testIllegalModeOption() {
@Test public void testIllegalCaseInsensitiveOptionValue() {
String[] test = {
"lexer grammar L;\n" +
"options { caseInsensitive = badValue; }\n" +
"DEFAULT_TOKEN: [A-F]+;\n",
"TOKEN_1: [A-F]+ -> caseInsensitive(badValue);\n",

"warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:2:28: unsupported option value caseInsensitive=badValue\n"
"warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:2:28: unsupported option value caseInsensitive=badValue\n" +
"warning(" + ErrorType.ILLEGAL_OPTION_VALUE.code + "): L.g4:3:35: unsupported option value caseInsensitive=badValue\n"
};

testErrors(test, false);
}

/**
 * A {@code caseInsensitive} lexer command whose value equals the grammar-level
 * option must produce REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND, for both the
 * {@code true} and the {@code false} case.
 */
@Test public void testRedundantCaseInsensitiveLexerCommand() {
    // Option is true and the argument-less command also means true.
    String[] redundantTrue = {
        "lexer grammar L;\n" +
        "options { caseInsensitive = true; }\n" +
        "TOKEN: [A-F]+ -> caseInsensitive;\n",

        "warning(" + ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND.code + "): L.g4:3:17: caseInsensitive lexer command is redundant because its value equals to global value (true)\n"
    };

    // Option is false and the command explicitly repeats false.
    String[] redundantFalse = {
        "lexer grammar L;\n" +
        "options { caseInsensitive = false; }\n" +
        "TOKEN: [A-F]+ -> caseInsensitive(false);\n",

        "warning(" + ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND.code + "): L.g4:3:17: caseInsensitive lexer command is redundant because its value equals to global value (false)\n"
    };

    testErrors(redundantTrue, false);
    testErrors(redundantFalse, false);
}

@Test public void testNotImpliedCharacters() {
String[] test = {
"lexer grammar Test;\n" +
Expand Down
59 changes: 29 additions & 30 deletions tool/src/org/antlr/v4/automata/LexerATNFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -207,51 +207,46 @@ public Handle lexerAltCommands(Handle alt, Handle cmds) {

/**
 * Creates the handle for a lexer command that was written with an argument,
 * e.g. {@code type(x)}.
 *
 * @param ID  the command name node
 * @param arg the command argument node
 * @return the handle produced for the command
 */
// NOTE(review): the body below appears to contain the pre-change implementation followed by
// the new single delegating return to lexerCallCommandOrCommand — confirm against the real file.
@Override
public Handle lexerCallCommand(GrammarAST ID, GrammarAST arg) {
LexerAction lexerAction = createLexerAction(ID, arg);
if (lexerAction != null) {
return action(ID, lexerAction);
}

// fall back to standard action generation for the command
ST cmdST = codegenTemplates.getInstanceOf("Lexer" +
CharSupport.capitalize(ID.getText())+
"Command");
if (cmdST == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText());
return epsilon(ID);
}

if (cmdST.impl.formalArguments == null || !cmdST.impl.formalArguments.containsKey("arg")) {
g.tool.errMgr.grammarError(ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText());
return epsilon(ID);
}

cmdST.add("arg", arg.getText());
cmdST.add("grammar", arg.g);
return action(cmdST.render());
return lexerCallCommandOrCommand(ID, arg);
}

/**
 * Creates the handle for a lexer command that was written without an argument,
 * e.g. {@code skip}, by delegating to lexerCallCommandOrCommand with a
 * {@code null} argument.
 *
 * @param ID the command name node
 * @return the handle produced for the command
 */
// NOTE(review): the createLexerAction line below looks like a leftover from the pre-change
// body; its result is not used here — confirm against the real file.
@Override
public Handle lexerCommand(GrammarAST ID) {
LexerAction lexerAction = createLexerAction(ID, null);
return lexerCallCommandOrCommand(ID, null);
}

/**
 * Shared implementation behind lexerCallCommand and lexerCommand.
 *
 * @param ID  the command name node
 * @param arg the command argument node, or {@code null} when the command was
 *            written without an argument
 * @return an action handle for the command, or the result of {@code epsilon(ID)}
 *         when the command is caseInsensitive or an error was reported
 */
private Handle lexerCallCommandOrCommand(GrammarAST ID, GrammarAST arg) {
// Prefer a LexerAction object when createLexerAction can build one.
LexerAction lexerAction = createLexerAction(ID, arg);
if (lexerAction != null) {
return action(ID, lexerAction);
}

// caseInsensitive contributes no action here; its per-rule value is read from
// currentRule.caseInsensitive (see getCurrentCaseInsensitivity) instead.
if (Rule.caseInsensitiveLexerCommandKey.equals(ID.getText())) {
return epsilon(ID);
}

// fall back to standard action generation for the command
// The template looked up is named "Lexer" + capitalized command name + "Command".
// NOTE(review): the two CharSupport.capitalize lines below look like the removed/added pair
// of a single diff line — confirm against the real file.
ST cmdST = codegenTemplates.getInstanceOf("Lexer" +
CharSupport.capitalize(ID.getText()) +
CharSupport.capitalize(ID.getText())+
"Command");
// No such template: the command name is reported as invalid.
if (cmdST == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_LEXER_COMMAND, g.fileName, ID.token, ID.getText());
return epsilon(ID);
}

// NOTE(review): the next two lines look like leftovers of the pre-change missing-argument
// check (there is no matching closing brace for this `if`) — confirm against the real file.
if (cmdST.impl.formalArguments != null && cmdST.impl.formalArguments.containsKey("arg")) {
g.tool.errMgr.grammarError(ErrorType.MISSING_LEXER_COMMAND_ARGUMENT, g.fileName, ID.token, ID.getText());
// callCommand: an argument was supplied. containsArg: the template declares a formal "arg".
boolean callCommand = arg != null;
boolean containsArg = cmdST.impl.formalArguments != null && cmdST.impl.formalArguments.containsKey("arg");
// A mismatch is an error: unwanted argument if one was supplied, missing argument otherwise.
if (callCommand != containsArg) {
ErrorType errorType = callCommand ? ErrorType.UNWANTED_LEXER_COMMAND_ARGUMENT : ErrorType.MISSING_LEXER_COMMAND_ARGUMENT;
g.tool.errMgr.grammarError(errorType, g.fileName, ID.token, ID.getText());
return epsilon(ID);
}

// Template attributes are filled in only when an argument is present.
if (callCommand) {
cmdST.add("arg", arg.getText());
cmdST.add("grammar", arg.g);
}

return action(cmdST.render());
}

Expand Down Expand Up @@ -279,7 +274,7 @@ public Handle set(GrammarAST associatedAST, List<GrammarAST> alts, boolean inver
int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText());
int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText());
if (checkRange((GrammarAST)t.getChild(0), (GrammarAST)t.getChild(1), a, b)) {
checkRangeAndAddToSet(associatedAST, t, set, a, b, caseInsensitive, null);
checkRangeAndAddToSet(associatedAST, t, set, a, b, getCurrentCaseInsensitivity(), null);
}
}
else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) {
Expand Down Expand Up @@ -567,11 +562,11 @@ private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParse
}

// Single-character convenience overload: forwards to the 7-argument checkRangeAndAddToSet
// with ast used as both AST arguments, c used as both range bounds, and a null previous status.
// NOTE(review): the two calls below look like the removed/added pair of one diff line
// (they differ only in how the case-insensitivity flag is obtained) — confirm against the real file.
private void checkCharAndAddToSet(GrammarAST ast, IntervalSet set, int c) {
checkRangeAndAddToSet(ast, ast, set, c, c, caseInsensitive, null);
checkRangeAndAddToSet(ast, ast, set, c, c, getCurrentCaseInsensitivity(), null);
}

// Range convenience overload: forwards to the 7-argument checkRangeAndAddToSet with mainAst
// used as both AST arguments and a null previous status.
// NOTE(review): the two calls below look like the removed/added pair of one diff line
// (they differ only in how the case-insensitivity flag is obtained) — confirm against the real file.
private void checkRangeAndAddToSet(GrammarAST mainAst, IntervalSet set, int a, int b) {
checkRangeAndAddToSet(mainAst, mainAst, set, a, b, caseInsensitive, null);
checkRangeAndAddToSet(mainAst, mainAst, set, a, b, getCurrentCaseInsensitivity(), null);
}

private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, GrammarAST ast, IntervalSet set, int a, int b, boolean caseInsensitive, CharactersDataCheckStatus previousStatus) {
Expand Down Expand Up @@ -630,7 +625,7 @@ private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, Gram

private Transition createTransition(ATNState target, int from, int to, CommonTree tree) {
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(from, to, g, tree, true);
if (caseInsensitive) {
if (getCurrentCaseInsensitivity()) {
if (charactersData.isSingleRange()) {
return CodePointTransitions.createWithCodePointRange(target, from, to);
}
Expand All @@ -646,6 +641,10 @@ private Transition createTransition(ATNState target, int from, int to, CommonTre
}
}

/**
 * Resolves the case-insensitivity flag that applies to the rule currently
 * being processed.
 *
 * @return the value stored on {@code currentRule} when it is non-null,
 *         otherwise this factory's own {@code caseInsensitive} value
 */
private boolean getCurrentCaseInsensitivity() {
    Boolean ruleLevelValue = currentRule.caseInsensitive;
    if (ruleLevelValue == null) {
        // The rule does not set a value of its own; use the factory-wide flag.
        return caseInsensitive;
    }
    return ruleLevelValue;
}

@Override
public Handle tokenRef(TerminalAST node) {
// Ref to EOF in lexer yields char transition on -1
Expand Down
64 changes: 62 additions & 2 deletions tool/src/org/antlr/v4/semantics/SemanticPipeline.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

package org.antlr.v4.semantics;

import org.antlr.runtime.tree.CommonTree;
import org.antlr.runtime.tree.Tree;
import org.antlr.v4.analysis.LeftRecursiveRuleTransformer;
import org.antlr.v4.automata.LexerATNFactory;
import org.antlr.v4.parse.ANTLRParser;
Expand Down Expand Up @@ -102,7 +104,7 @@ public void process() {
// ASSIGN TOKEN TYPES
g.importTokensFromTokensFile();
if ( g.isLexer() ) {
assignLexerTokenTypes(g, collector.tokensDefs);
assignLexerTokenTypesAndCaseInsensitivity(g, collector.tokensDefs);
}
else {
assignTokenTypes(g, collector.tokensDefs,
Expand Down Expand Up @@ -136,7 +138,7 @@ void identifyStartRules(SymbolCollector collector) {
}
}

void assignLexerTokenTypes(Grammar g, List<GrammarAST> tokensDefs) {
void assignLexerTokenTypesAndCaseInsensitivity(Grammar g, List<GrammarAST> tokensDefs) {
Grammar G = g.getOutermostGrammar(); // put in root, even if imported
for (GrammarAST def : tokensDefs) {
// tokens { id (',' id)* } so must check IDs not TOKEN_REF
Expand All @@ -152,6 +154,7 @@ void assignLexerTokenTypes(Grammar g, List<GrammarAST> tokensDefs) {
if ( !r.isFragment() && !hasTypeOrMoreCommand(r) ) {
G.defineTokenName(r.name);
}
extractRuleCaseInsensitivity(r);
}

// FOR ALL X : 'xxx'; RULES, DEFINE 'xxx' AS TYPE X
Expand Down Expand Up @@ -210,6 +213,63 @@ else if (LexerMoreAction.key.equals(node.getText())) {
return false;
}

/**
 * Scans the lexer commands of rule {@code r} for {@code caseInsensitive} and
 * stores an explicit setting in {@code r.caseInsensitive}.
 *
 * <p>A command written without an argument counts as {@code true}. An argument
 * other than {@code true}/{@code false} is reported as ILLEGAL_OPTION_VALUE and
 * leaves the rule untouched. A value equal to the grammar-level option is
 * reported as REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND but is still stored.
 * When the command occurs several times, the last valid occurrence wins.</p>
 *
 * @param r the rule to inspect; nothing happens when it has no AST or no
 *          LEXER_ALT_ACTION descendant
 */
void extractRuleCaseInsensitivity(Rule r) {
    GrammarAST ruleAst = r.ast;
    if (ruleAst == null) {
        return;
    }

    GrammarAST commandsRoot = (GrammarAST) ruleAst.getFirstDescendantWithType(ANTLRParser.LEXER_ALT_ACTION);
    if (commandsRoot == null) {
        return;
    }

    // Grammar-level default: only the literal "true" switches it on.
    boolean globalValue = "true".equals(g.getOptionString("caseInsensitive"));

    // Iteration starts at child 1; child 0 is presumably the alternative itself — TODO confirm.
    for (int idx = 1; idx < commandsRoot.getChildCount(); idx++) {
        GrammarAST command = (GrammarAST) commandsRoot.getChild(idx);
        boolean isCall = command.getType() == ANTLRParser.LEXER_ACTION_CALL;

        // A call node carries the name in child 0 and the argument in child 1;
        // a bare command is its own name and implies the value "true".
        Tree keyNode = isCall ? command.getChild(0) : command;
        Tree valueNode = isCall ? command.getChild(1) : null;
        String valueText = isCall ? valueNode.getText() : "true";
        String keyText = keyNode.getText();

        if (!Rule.caseInsensitiveLexerCommandKey.equals(keyText)) {
            continue;
        }

        Boolean ruleValue;
        if ("true".equals(valueText)) {
            ruleValue = Boolean.TRUE;
        }
        else if ("false".equals(valueText)) {
            ruleValue = Boolean.FALSE;
        }
        else {
            ruleValue = null;
        }

        if (ruleValue == null) {
            g.tool.errMgr.grammarError(ErrorType.ILLEGAL_OPTION_VALUE, g.fileName,
                    ((GrammarAST) valueNode).getToken(), keyText, valueText);
            continue;
        }

        if (ruleValue.booleanValue() == globalValue) {
            g.tool.errMgr.grammarError(ErrorType.REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND,
                    g.fileName, ((CommonTree) keyNode).getToken(), globalValue);
        }
        r.caseInsensitive = ruleValue;
    }
}

void assignTokenTypes(Grammar g, List<GrammarAST> tokensDefs,
List<GrammarAST> tokenIDs, List<GrammarAST> terminals)
{
Expand Down
13 changes: 13 additions & 0 deletions tool/src/org/antlr/v4/tool/ErrorType.java
Original file line number Diff line number Diff line change
Expand Up @@ -1107,6 +1107,19 @@ public enum ErrorType {
"Range <arg>..<arg2> probably contains not implied characters <arg3>. Both bounds should be defined in lower or UPPER case",
ErrorSeverity.WARNING
),
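/**
* <p>Redundant {@code caseInsensitive} lexer command: the value given by the
* command is the same as the grammar-level {@code caseInsensitive} value.</p>
*
* <pre>
* options { caseInsensitive=true; }
* TOKEN: [a-z]+ -> caseInsensitive(true); // warning
* </pre>
*/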
REDUNDANT_CASE_INSENSITIVE_LEXER_COMMAND(
186,
"caseInsensitive lexer command is redundant because its value equals to global value (<arg>)",
ErrorSeverity.WARNING
),

/*
* Backward incompatibility errors
Expand Down
5 changes: 5 additions & 0 deletions tool/src/org/antlr/v4/tool/Rule.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public class Rule implements AttributeResolver {
predefinedRulePropertiesDict.add(new Attribute("ctx"));
}

public static final String caseInsensitiveLexerCommandKey = "caseInsensitive";

public String name;
public List<GrammarAST> modifiers;

Expand All @@ -52,6 +54,9 @@ public class Rule implements AttributeResolver {
/** If we're in a lexer grammar, we might be in a mode */
public String mode;

/** If null then use value from global option that is false by default */
public Boolean caseInsensitive = null;

/** Map a name to an action for this rule like @init {...}.
* The code generator will use this to fill holes in the rule template.
* I track the AST node for the action in case I need the line number
Expand Down
7 changes: 6 additions & 1 deletion tool/src/org/antlr/v4/tool/ast/GrammarAST.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,12 @@ public List<GrammarAST> getNodesWithType(IntervalSet types) {
t = work.remove(0);
if ( types==null || types.contains(t.getType()) ) nodes.add(t);
if ( t.children!=null ) {
work.addAll(Arrays.asList(t.getChildrenAsArray()));
GrammarAST[] childrenArray = t.getChildrenAsArray();
for (GrammarAST child : childrenArray) {
if (child.getType() != ANTLRParser.LEXER_ACTION_CALL) {
work.add(child);
}
}
}
}
return nodes;
Expand Down

0 comments on commit 214a45b

Please sign in to comment.