Skip to content

Commit

Permalink
Merge pull request #3435 from KvanTTT/case-insensitive-fixes
Browse files Browse the repository at this point in the history
Fix case insensitive option with unicode, RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS fixes
  • Loading branch information
parrt authored Dec 26, 2021
2 parents 77be902 + df62fba commit faefc25
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 50 deletions.
39 changes: 34 additions & 5 deletions tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {

"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:2:18: chars a-f used multiple times in set [aa-f]\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars D-J used multiple times in set [A-FD-J]\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars O-V used multiple times in set 'Z' | 'K'..'R' | 'O'..'V'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:38: chars O-V used multiple times in set 'Z' | 'K'..'R' | 'O'..'V'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars 'g' used multiple times in set 'g'..'l'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars '\\n' used multiple times in set '\\n'..'\\r'\n"
};
Expand All @@ -411,16 +411,32 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {
"TOKEN_RANGE_3: 'm'..'q' | [M-Q];\n",

"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars a-f used multiple times in set [a-fA-F0-9]\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars A-F used multiple times in set [a-fA-F0-9]\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars g-l used multiple times in set 'g'..'l' | 'G'..'L'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars G-L used multiple times in set 'g'..'l' | 'G'..'L'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:32: chars g-l used multiple times in set 'g'..'l' | 'G'..'L'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars 'M' used multiple times in set 'M'..'Q' | 'm'..'q'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars 'm' used multiple times in set 'M'..'Q' | 'm'..'q'\n"
};

testErrors(test, false);
}

@Test public void testCaseInsensitiveWithUnicodeRanges() {
String[] test = {
"lexer grammar L;\n" +
"options { caseInsensitive=true; }\n" +
"FullWidthLetter\n" +
" : '\\u00c0'..'\\u00d6' // ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ\n" +
" | '\\u00f8'..'\\u00ff' // øùúûüýþÿ\n" +
" ;",

""
};

// Don't transform øùúûüýþÿ to uppercase
// ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸ
// because of different length of lower and UPPER range
testErrors(test, false);
}

@Test public void testUnreachableTokens() {
String[] test = {
"lexer grammar Test;\n" +
Expand Down Expand Up @@ -473,12 +489,25 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {
"lexer grammar Test;\n" +
"TOKEN1: 'A'..'g';\n" +
"TOKEN2: [C-m];\n" +
"TOKEN3: [А-я]; // OK since range does not contain intermediate characters",
"TOKEN3: [А-я]; // OK since range does not contain intermediate characters\n" +
"TOKEN4: '\\u0100'..'\\u1fff'; // OK since range borders are unicode characters",

"warning(" + ErrorType.RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS.code + "): Test.g4:2:8: Range A..g probably contains not implied characters [\\]^_`. Both bounds should be defined in lower or UPPER case\n" +
"warning(" + ErrorType.RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS.code + "): Test.g4:3:8: Range C..m probably contains not implied characters [\\]^_`. Both bounds should be defined in lower or UPPER case\n"
};

testErrors(test, false);
}

@Test public void testNotImpliedCharactersWithCaseInsensitiveOption() {
String[] test = {
"lexer grammar Test;\n" +
"options { caseInsensitive=true; }\n" +
"TOKEN: [A-z];",

"warning(" + ErrorType.RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS.code + "): Test.g4:3:7: Range A..z probably contains not implied characters [\\]^_`. Both bounds should be defined in lower or UPPER case\n"
};

testErrors(test, false);
}
}
11 changes: 11 additions & 0 deletions tool/src/org/antlr/v4/automata/CharactersDataCheckStatus.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package org.antlr.v4.automata;

public class CharactersDataCheckStatus {
public final boolean collision;
public final boolean notImpliedCharacters;

public CharactersDataCheckStatus(boolean collision, boolean notImpliedCharacters) {
this.collision = collision;
this.notImpliedCharacters = notImpliedCharacters;
}
}
85 changes: 48 additions & 37 deletions tool/src/org/antlr/v4/automata/LexerATNFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ public Handle set(GrammarAST associatedAST, List<GrammarAST> alts, boolean inver
int a = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(0).getText());
int b = CharSupport.getCharValueFromGrammarCharLiteral(t.getChild(1).getText());
if (checkRange((GrammarAST)t.getChild(0), (GrammarAST)t.getChild(1), a, b)) {
checkRangeAndAddToSet(associatedAST, set, a, b, caseInsensitive);
checkRangeAndAddToSet(associatedAST, t, set, a, b, caseInsensitive, null);
}
}
else if ( t.getType()==ANTLRParser.LEXER_CHAR_SET ) {
Expand Down Expand Up @@ -517,7 +517,7 @@ private CharSetParseState applyPrevStateAndMoveToCodePoint(
charSetAST.getToken(),
CharSupport.getRangeEscapedString(state.prevCodePoint, codePoint));
}
checkRangeAndAddToSet(charSetAST, set, state.prevCodePoint, codePoint, caseInsensitive);
checkRangeAndAddToSet(charSetAST, set, state.prevCodePoint, codePoint);
state = CharSetParseState.NONE;
}
else {
Expand Down Expand Up @@ -567,60 +567,71 @@ private void applyPrevState(GrammarAST charSetAST, IntervalSet set, CharSetParse
}

private void checkCharAndAddToSet(GrammarAST ast, IntervalSet set, int c) {
checkRangeAndAddToSet(ast, set, c, c, caseInsensitive);
checkRangeAndAddToSet(ast, ast, set, c, c, caseInsensitive, null);
}

private void checkRangeAndAddToSet(GrammarAST ast, IntervalSet set, int a, int b, boolean caseInsensitive) {
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(a, b, g, ast);
private void checkRangeAndAddToSet(GrammarAST mainAst, IntervalSet set, int a, int b) {
checkRangeAndAddToSet(mainAst, mainAst, set, a, b, caseInsensitive, null);
}

private CharactersDataCheckStatus checkRangeAndAddToSet(GrammarAST rootAst, GrammarAST ast, IntervalSet set, int a, int b, boolean caseInsensitive, CharactersDataCheckStatus previousStatus) {
CharactersDataCheckStatus status;
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(a, b, g, ast,
previousStatus == null || !previousStatus.notImpliedCharacters);
if (caseInsensitive) {
if (charactersData.lowerFrom == charactersData.upperFrom && charactersData.lowerTo == charactersData.upperTo ||
charactersData.mixOfLowerAndUpperCharCase
) {
checkRangeAndAddToSet(ast, set, a, b, false);
status = new CharactersDataCheckStatus(false, charactersData.mixOfLowerAndUpperCharCase);
if (charactersData.isSingleRange()) {
status = checkRangeAndAddToSet(rootAst, ast, set, a, b, false, status);
}
else {
checkRangeAndAddToSet(ast, set, charactersData.lowerFrom, charactersData.lowerTo, false);
checkRangeAndAddToSet(ast, set, charactersData.upperFrom, charactersData.upperTo, false);
status = checkRangeAndAddToSet(rootAst, ast, set, charactersData.lowerFrom, charactersData.lowerTo, false, status);
// Don't report similar warning twice
status = checkRangeAndAddToSet(rootAst, ast, set, charactersData.upperFrom, charactersData.upperTo, false, status);
}
}
else {
for (int i = a; i <= b; i++) {
if (set.contains(i)) {
String setText;
if (ast.getChildren() == null) {
setText = ast.getText();
}
else {
StringBuilder sb = new StringBuilder();
for (Object child : ast.getChildren()) {
if (child instanceof RangeAST) {
sb.append(((RangeAST) child).getChild(0).getText());
sb.append("..");
sb.append(((RangeAST) child).getChild(1).getText());
}
else {
sb.append(((GrammarAST) child).getText());
boolean charactersCollision = previousStatus != null && previousStatus.collision;
if (!charactersCollision) {
for (int i = a; i <= b; i++) {
if (set.contains(i)) {
String setText;
if (rootAst.getChildren() == null) {
setText = rootAst.getText();
}
else {
StringBuilder sb = new StringBuilder();
for (Object child : rootAst.getChildren()) {
if (child instanceof RangeAST) {
sb.append(((RangeAST) child).getChild(0).getText());
sb.append("..");
sb.append(((RangeAST) child).getChild(1).getText());
}
else {
sb.append(((GrammarAST) child).getText());
}
sb.append(" | ");
}
sb.append(" | ");
sb.replace(sb.length() - 3, sb.length(), "");
setText = sb.toString();
}
sb.replace(sb.length() - 3, sb.length(), "");
setText = sb.toString();
String charsString = a == b ? String.valueOf((char)a) : (char) a + "-" + (char) b;
g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
charsString, setText);
charactersCollision = true;
break;
}
g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName, ast.getToken(),
(char) a + "-" + (char) b, setText);
break;
}
}
status = new CharactersDataCheckStatus(charactersCollision, charactersData.mixOfLowerAndUpperCharCase);
set.add(a, b);
}
return status;
}

private Transition createTransition(ATNState target, int from, int to, CommonTree tree) {
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(from, to, g, tree);
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(from, to, g, tree, true);
if (caseInsensitive) {
if (charactersData.lowerFrom == charactersData.upperFrom && charactersData.lowerTo == charactersData.upperTo ||
charactersData.mixOfLowerAndUpperCharCase
) {
if (charactersData.isSingleRange()) {
return CodePointTransitions.createWithCodePointRange(target, from, to);
}
else {
Expand Down
25 changes: 17 additions & 8 deletions tool/src/org/antlr/v4/automata/RangeBorderCharactersData.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import org.antlr.v4.tool.Grammar;

public class RangeBorderCharactersData {
public int lowerFrom;
public int upperFrom;
public int lowerTo;
public int upperTo;
public boolean mixOfLowerAndUpperCharCase;
public final int lowerFrom;
public final int upperFrom;
public final int lowerTo;
public final int upperTo;
public final boolean mixOfLowerAndUpperCharCase;

public RangeBorderCharactersData(int lowerFrom, int upperFrom, int lowerTo, int upperTo, boolean mixOfLowerAndUpperCharCase) {
this.lowerFrom = lowerFrom;
Expand All @@ -19,18 +19,21 @@ public RangeBorderCharactersData(int lowerFrom, int upperFrom, int lowerTo, int
this.mixOfLowerAndUpperCharCase = mixOfLowerAndUpperCharCase;
}

public static RangeBorderCharactersData getAndCheckCharactersData(int from, int to, Grammar grammar, CommonTree tree) {
public static RangeBorderCharactersData getAndCheckCharactersData(int from, int to, Grammar grammar, CommonTree tree,
boolean reportRangeContainsNotImpliedCharacters
) {
int lowerFrom = Character.toLowerCase(from);
int upperFrom = Character.toUpperCase(from);
int lowerTo = Character.toLowerCase(to);
int upperTo = Character.toUpperCase(to);

boolean isLowerFrom = lowerFrom == from;
boolean isLowerTo = lowerTo == to;
boolean mixOfLowerAndUpperCharCase = isLowerFrom && !isLowerTo || !isLowerFrom && isLowerTo;
if (mixOfLowerAndUpperCharCase) {
if (reportRangeContainsNotImpliedCharacters && mixOfLowerAndUpperCharCase && from <= 0x7F && to <= 0x7F) {
StringBuilder notImpliedCharacters = new StringBuilder();
for (int i = from; i < to; i++) {
if (Character.toLowerCase(i) == Character.toUpperCase(i)) {
if (!Character.isAlphabetic(i)) {
notImpliedCharacters.append((char)i);
}
}
Expand All @@ -41,4 +44,10 @@ public static RangeBorderCharactersData getAndCheckCharactersData(int from, int
}
return new RangeBorderCharactersData(lowerFrom, upperFrom, lowerTo, upperTo, mixOfLowerAndUpperCharCase);
}

public boolean isSingleRange() {
return lowerFrom == upperFrom && lowerTo == upperTo ||
mixOfLowerAndUpperCharCase ||
lowerTo - lowerFrom != upperTo - upperFrom;
}
}

0 comments on commit faefc25

Please sign in to comment.