Skip to content

Commit

Permalink
Improve XMLScanner performance
Browse files Browse the repository at this point in the history
Fix #444

This PR improves XMLScanner performance by replacing regular expressions with
plain Java code for the three most frequently used patterns (element name,
attribute name, attribute value).

With this change, a large file such as nasa.xml is parsed 2-3 times
faster. You can observe the timings by running XMLScannerPerformance and
DOMParserPerformance.

Signed-off-by: azerr <[email protected]>
  • Loading branch information
angelozerr committed Jun 17, 2019
1 parent 20819b3 commit dc2e0a6
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ public class Constants {

public final static int _EXL = "!".codePointAt(0);
public final static int _MIN = "-".codePointAt(0);
public final static int _UDS = "_".codePointAt(0);
public final static int _DDT = ":".codePointAt(0);
public final static int _DOT = ".".codePointAt(0);
public final static int _LAN = "<".codePointAt(0);
public final static int _RAN = ">".codePointAt(0);
public final static int _FSL = "/".codePointAt(0);
Expand Down Expand Up @@ -70,26 +73,26 @@ public class Constants {

public static final Pattern PI_TAG_NAME = Pattern.compile("^[a-zA-Z0-9]+");

//Add coming processing instructions that are defined to have attributes as content
// Add coming processing instructions that are defined to have attributes as
// content
public static final Pattern PI_WITH_VARIABLES = Pattern.compile("^(xml-stylesheet)[\\s<>?]?");

public static final Pattern DOCTYPE_KIND_OPTIONS = Pattern.compile("^(PUBLIC|SYSTEM)([\\s<>\"'])");



public static final Pattern DTD_ELEMENT_CATEGORY = Pattern.compile("^(EMPTY|ANY)([\\s<>\"'])");

public static final Pattern DTD_ELEMENT_CONTENT = Pattern.compile("^(\\((([^\\s,]+,)*[^\\s,]+)\\))|\\(\\)");

public static final Pattern DTD_PCDATA = Pattern.compile("^#PCDATA");

public static final Pattern DTD_ATTLIST_ATTRIBUTE_TYPE = Pattern.compile("^(CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION|xml:|\\(.*\\))([\\s<>\"'])");
public static final Pattern DTD_ATTLIST_ATTRIBUTE_TYPE = Pattern
.compile("^(CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION|xml:|\\(.*\\))([\\s<>\"'])");

public static final Pattern DTD_ATTLIST_ATTRIBUTE_VALUE = Pattern.compile("^(#REQUIRED|#IMPLIED|\".*\"|#FIXED \".*\")([\\s<>\"'])");
public static final Pattern DTD_ATTLIST_ATTRIBUTE_VALUE = Pattern
.compile("^(#REQUIRED|#IMPLIED|\".*\"|#FIXED \".*\")([\\s<>\"'])");

public static final Pattern DTD_ENTITY_VALUE = Pattern.compile("^\".*\"");

public static final Pattern DOCTYPE_NAME =
Pattern.compile("^[_:\\w][_:\\w-.\\d]*");
public static final Pattern DOCTYPE_NAME = Pattern.compile("^[_:\\w][_:\\w-.\\d]*");

}
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ public class MultiLineStream {
return ch == _WSP || ch == _TAB || ch == _NWL || ch == _LFD || ch == _CAR;
};

private static final Predicate<Integer> CHARACTER_PREDICATE = ch -> {
return ch != _WSP && ch != _TAB && ch != _NWL && ch != _LFD && ch != _CAR;
};

private final String source;
private final int len;
private int position;
Expand Down Expand Up @@ -290,16 +286,11 @@ public boolean advanceUntilCharsOrNewTag(int... ch) {
return false;
}

public boolean skipWhitespace() {
int n = this.advanceWhileChar(WHITESPACE_PREDICATE);
return n > 0;
}

/**
* Skips consecutive whitespace characters; returns true if at least one was skipped
*/
public boolean readNextWord() {
int n = this.advanceWhileChar(CHARACTER_PREDICATE);
public boolean skipWhitespace() {
int n = this.advanceWhileChar(WHITESPACE_PREDICATE);
return n > 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,49 @@
*/
package org.eclipse.lsp4xml.dom.parser;

import static org.eclipse.lsp4xml.dom.parser.Constants.*;
import static org.eclipse.lsp4xml.dom.parser.Constants.ATTRIBUTE_NAME_REGEX;
import static org.eclipse.lsp4xml.dom.parser.Constants.DOCTYPE_KIND_OPTIONS;
import static org.eclipse.lsp4xml.dom.parser.Constants.DTD_ELEMENT_CATEGORY;
import static org.eclipse.lsp4xml.dom.parser.Constants.ELEMENT_NAME_REGEX;
import static org.eclipse.lsp4xml.dom.parser.Constants.PROLOG_NAME_OPTIONS;
import static org.eclipse.lsp4xml.dom.parser.Constants.URL_VALUE_REGEX;
import static org.eclipse.lsp4xml.dom.parser.Constants._AST;
import static org.eclipse.lsp4xml.dom.parser.Constants._AVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._CAR;
import static org.eclipse.lsp4xml.dom.parser.Constants._CRB;
import static org.eclipse.lsp4xml.dom.parser.Constants._CSB;
import static org.eclipse.lsp4xml.dom.parser.Constants._CVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._DDT;
import static org.eclipse.lsp4xml.dom.parser.Constants._DOT;
import static org.eclipse.lsp4xml.dom.parser.Constants._DQO;
import static org.eclipse.lsp4xml.dom.parser.Constants._DVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._EQS;
import static org.eclipse.lsp4xml.dom.parser.Constants._EVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._EXL;
import static org.eclipse.lsp4xml.dom.parser.Constants._FSL;
import static org.eclipse.lsp4xml.dom.parser.Constants._IVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._LAN;
import static org.eclipse.lsp4xml.dom.parser.Constants._LVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._MIN;
import static org.eclipse.lsp4xml.dom.parser.Constants._MVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._NVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._NWL;
import static org.eclipse.lsp4xml.dom.parser.Constants._ORB;
import static org.eclipse.lsp4xml.dom.parser.Constants._OSB;
import static org.eclipse.lsp4xml.dom.parser.Constants._OVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._PCT;
import static org.eclipse.lsp4xml.dom.parser.Constants._PLS;
import static org.eclipse.lsp4xml.dom.parser.Constants._PVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._QMA;
import static org.eclipse.lsp4xml.dom.parser.Constants._RAN;
import static org.eclipse.lsp4xml.dom.parser.Constants._SIQ;
import static org.eclipse.lsp4xml.dom.parser.Constants._SVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._TVL;
import static org.eclipse.lsp4xml.dom.parser.Constants._UDS;
import static org.eclipse.lsp4xml.dom.parser.Constants._WSP;
import static org.eclipse.lsp4xml.dom.parser.Constants._YVL;

import java.util.function.Predicate;

import org.eclipse.lsp4xml.dom.DOMDocumentType.DocumentTypeKind;;

Expand All @@ -20,16 +62,29 @@
*/
public class XMLScanner implements Scanner {

private static final Predicate<Integer> START_ELEMENT_NAME_PREDICATE = ch -> {
// ^[_:\w]
return ch == _UDS || ch == _DDT || Character.isLetter(ch);
};

private static final Predicate<Integer> ELEMENT_NAME_PREDICATE = ch -> {
// [_:\w-.\d]*
return ch == _UDS /* '_' */ || ch == _DDT /* ':' */ || ch == _DOT /* '.' */ || ch == _MIN /* '-' */
|| Character.isLetterOrDigit(ch);
};

private static final Predicate<Integer> ATTRIBUTE_NAME_PREDICATE = ch -> {
// ^[^\s\?\"'<>\/=\x00-\x0F\x7F\x80-\x9F]*
return !Character.isWhitespace(ch) && ch != _QMA && ch != _DQO && ch != _SIQ && ch != _LAN && ch != _RAN
&& ch != _FSL && ch != _EQS && !(ch >= 0x00 && ch <= 0x0F) && ch != 0x7F && !(ch >= 0x80 && ch <= 0x9F);
};

MultiLineStream stream;
ScannerState state;
int tokenOffset;
TokenType tokenType;
String tokenError;


String lastTag;
String lastAttributeName;
String lastTypeValue;
String lastDoctypeKind;
String url;
boolean isInsideDTDContent = false; // Either internal dtd in xml file OR external dtd in dtd file
Expand Down Expand Up @@ -58,14 +113,51 @@ public XMLScanner(String input, int initialOffset, ScannerState initialState, bo
this.isDTDFile = isDTDFile;
}

String nextElementName() {
return stream.advanceIfRegExp(ELEMENT_NAME_REGEX);
/**
* Returns true if the current token is an element name and false otherwise.
*
* @return true if the current token is an element name and false otherwise.
*/
boolean hasNextElementName() {
// Element name regexp : ^[_:\w][_:\w-.\d]*
// ^[_:\w]
if (!START_ELEMENT_NAME_PREDICATE.test(stream.peekChar())) {
return false;
}
stream.advance(1);
// [_:\w-.\d]*
stream.advanceWhileChar(ELEMENT_NAME_PREDICATE);
return true;
}

String nextAttributeName() {
return stream.advanceIfRegExp(ATTRIBUTE_NAME_REGEX);
/**
* Returns true if the current token is an attribute name and false otherwise.
*
* @return true if the current token is an attribute name and false otherwise.
*/
boolean hasNextAttributeName() {
// ^[^\s\?\"'<>\/=\x00-\x0F\x7F\x80-\x9F]*
return stream.advanceWhileChar(ATTRIBUTE_NAME_PREDICATE) > 0;
}

/**
* Returns true if the current token is an attribute value and false otherwise.
*
* @return true if the current token is an attribute value and false otherwise.
*/
boolean hasNextAttributeValue() {
// ^("[^"]*"?)|('[^']*'?)
int first = stream.peekChar();
if (first == _SIQ || first == _DQO) {
stream.advance(1);
if( stream.advanceUntilChar(first)) {
stream.advance(1);
}
return true;
}
return false;
}

String doctypeName() {
return stream.advanceIfRegExp(ELEMENT_NAME_REGEX);
}
Expand Down Expand Up @@ -210,8 +302,7 @@ TokenType internalScan() {
return finishToken(offset, TokenType.CDATAContent);

case AfterOpeningEndTag:
String tagName = nextElementName();
if (tagName.length() > 0) {
if (hasNextElementName()) {
state = ScannerState.WithinEndTag;
return finishToken(offset, TokenType.EndTag);
}
Expand Down Expand Up @@ -243,10 +334,7 @@ TokenType internalScan() {
return finishToken(offset, TokenType.Whitespace);

case AfterOpeningStartTag:
lastTag = nextElementName();
lastTypeValue = null;
lastAttributeName = null;
if (lastTag.length() > 0) {
if (hasNextElementName()) {
state = ScannerState.WithinTag;
return finishToken(offset, TokenType.StartTag);
}
Expand All @@ -272,8 +360,7 @@ TokenType internalScan() {
return finishToken(offset, TokenType.PrologEnd);
}

lastAttributeName = nextAttributeName();
if (lastAttributeName.length() > 0) {
if (hasNextAttributeName()) {
state = ScannerState.AfterAttributeName;
return finishToken(offset, TokenType.AttributeName);
}
Expand Down Expand Up @@ -319,11 +406,7 @@ TokenType internalScan() {
if (stream.skipWhitespace()) {
return finishToken(offset, TokenType.Whitespace);
}
String attributeValue = stream.advanceIfRegExp(ATTRIBUTE_VALUE_REGEX);
if (attributeValue.length() > 0) {
if ("type".equals(lastAttributeName)) {
lastTypeValue = attributeValue;
}
if (hasNextAttributeValue()) {
state = ScannerState.WithinTag;
return finishToken(offset, TokenType.AttributeValue);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,20 @@ public void testNestedElement() {
assertOffsetAndToken(21, TokenType.EndTagClose);
}

/**
 * Checks that a '-' inside an element name is scanned as part of the name:
 * both the start tag and the end tag must yield the full "hello-word" token
 * at the expected offsets.
 */
@Test
public void testMinusElement() {
scanner = XMLScanner.createScanner("<hello-word></hello-word>");

//<hello-word>
assertOffsetAndToken(0, TokenType.StartTagOpen);
assertOffsetAndToken(1, TokenType.StartTag, "hello-word");
assertOffsetAndToken(11, TokenType.StartTagClose);
//</hello-word>
assertOffsetAndToken(12, TokenType.EndTagOpen);
assertOffsetAndToken(14, TokenType.EndTag, "hello-word");
assertOffsetAndToken(24, TokenType.EndTagClose);
}

@Test
public void testElementWithAttribute() {
scanner = XMLScanner.createScanner("<hello key=\"value\"></hello>");
Expand Down

0 comments on commit dc2e0a6

Please sign in to comment.