diff --git a/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/Constants.java b/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/Constants.java index 824896bcd..4f712cd5c 100644 --- a/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/Constants.java +++ b/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/Constants.java @@ -16,6 +16,9 @@ public class Constants { public final static int _EXL = "!".codePointAt(0); public final static int _MIN = "-".codePointAt(0); + public final static int _UDS = "_".codePointAt(0); + public final static int _DDT = ":".codePointAt(0); + public final static int _DOT = ".".codePointAt(0); public final static int _LAN = "<".codePointAt(0); public final static int _RAN = ">".codePointAt(0); public final static int _FSL = "/".codePointAt(0); @@ -70,26 +73,26 @@ public class Constants { public static final Pattern PI_TAG_NAME = Pattern.compile("^[a-zA-Z0-9]+"); - //Add coming processing instructions that are defined to have attributes as content + // Add coming processing instructions that are defined to have attributes as + // content public static final Pattern PI_WITH_VARIABLES = Pattern.compile("^(xml-stylesheet)[\\s<>?]?"); public static final Pattern DOCTYPE_KIND_OPTIONS = Pattern.compile("^(PUBLIC|SYSTEM)([\\s<>\"'])"); - - public static final Pattern DTD_ELEMENT_CATEGORY = Pattern.compile("^(EMPTY|ANY)([\\s<>\"'])"); public static final Pattern DTD_ELEMENT_CONTENT = Pattern.compile("^(\\((([^\\s,]+,)*[^\\s,]+)\\))|\\(\\)"); public static final Pattern DTD_PCDATA = Pattern.compile("^#PCDATA"); - public static final Pattern DTD_ATTLIST_ATTRIBUTE_TYPE = Pattern.compile("^(CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION|xml:|\\(.*\\))([\\s<>\"'])"); + public static final Pattern DTD_ATTLIST_ATTRIBUTE_TYPE = Pattern + .compile("^(CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION|xml:|\\(.*\\))([\\s<>\"'])"); - public static final Pattern 
DTD_ATTLIST_ATTRIBUTE_VALUE = Pattern.compile("^(#REQUIRED|#IMPLIED|\".*\"|#FIXED \".*\")([\\s<>\"'])"); + public static final Pattern DTD_ATTLIST_ATTRIBUTE_VALUE = Pattern + .compile("^(#REQUIRED|#IMPLIED|\".*\"|#FIXED \".*\")([\\s<>\"'])"); public static final Pattern DTD_ENTITY_VALUE = Pattern.compile("^\".*\""); - public static final Pattern DOCTYPE_NAME = - Pattern.compile("^[_:\\w][_:\\w-.\\d]*"); + public static final Pattern DOCTYPE_NAME = Pattern.compile("^[_:\\w][_:\\w-.\\d]*"); } diff --git a/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/MultiLineStream.java b/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/MultiLineStream.java index 4fac8e22c..823985d0b 100644 --- a/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/MultiLineStream.java +++ b/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/MultiLineStream.java @@ -36,10 +36,6 @@ public class MultiLineStream { return ch == _WSP || ch == _TAB || ch == _NWL || ch == _LFD || ch == _CAR; }; - private static final Predicate CHARACTER_PREDICATE = ch -> { - return ch != _WSP && ch != _TAB && ch != _NWL && ch != _LFD && ch != _CAR; - }; - private final String source; private final int len; private int position; @@ -290,16 +286,11 @@ public boolean advanceUntilCharsOrNewTag(int... 
ch) { return false; } - public boolean skipWhitespace() { - int n = this.advanceWhileChar(WHITESPACE_PREDICATE); - return n > 0; - } - /** * Advances until it reaches a whitespace character */ - public boolean readNextWord() { - int n = this.advanceWhileChar(CHARACTER_PREDICATE); + public boolean skipWhitespace() { + int n = this.advanceWhileChar(WHITESPACE_PREDICATE); return n > 0; } diff --git a/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/XMLScanner.java b/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/XMLScanner.java index ec825274e..68d4633e3 100644 --- a/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/XMLScanner.java +++ b/org.eclipse.lsp4xml/src/main/java/org/eclipse/lsp4xml/dom/parser/XMLScanner.java @@ -10,7 +10,49 @@ */ package org.eclipse.lsp4xml.dom.parser; -import static org.eclipse.lsp4xml.dom.parser.Constants.*; +import static org.eclipse.lsp4xml.dom.parser.Constants.ATTRIBUTE_NAME_REGEX; +import static org.eclipse.lsp4xml.dom.parser.Constants.DOCTYPE_KIND_OPTIONS; +import static org.eclipse.lsp4xml.dom.parser.Constants.DTD_ELEMENT_CATEGORY; +import static org.eclipse.lsp4xml.dom.parser.Constants.ELEMENT_NAME_REGEX; +import static org.eclipse.lsp4xml.dom.parser.Constants.PROLOG_NAME_OPTIONS; +import static org.eclipse.lsp4xml.dom.parser.Constants.URL_VALUE_REGEX; +import static org.eclipse.lsp4xml.dom.parser.Constants._AST; +import static org.eclipse.lsp4xml.dom.parser.Constants._AVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._CAR; +import static org.eclipse.lsp4xml.dom.parser.Constants._CRB; +import static org.eclipse.lsp4xml.dom.parser.Constants._CSB; +import static org.eclipse.lsp4xml.dom.parser.Constants._CVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._DDT; +import static org.eclipse.lsp4xml.dom.parser.Constants._DOT; +import static org.eclipse.lsp4xml.dom.parser.Constants._DQO; +import static org.eclipse.lsp4xml.dom.parser.Constants._DVL; +import static 
org.eclipse.lsp4xml.dom.parser.Constants._EQS; +import static org.eclipse.lsp4xml.dom.parser.Constants._EVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._EXL; +import static org.eclipse.lsp4xml.dom.parser.Constants._FSL; +import static org.eclipse.lsp4xml.dom.parser.Constants._IVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._LAN; +import static org.eclipse.lsp4xml.dom.parser.Constants._LVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._MIN; +import static org.eclipse.lsp4xml.dom.parser.Constants._MVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._NVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._NWL; +import static org.eclipse.lsp4xml.dom.parser.Constants._ORB; +import static org.eclipse.lsp4xml.dom.parser.Constants._OSB; +import static org.eclipse.lsp4xml.dom.parser.Constants._OVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._PCT; +import static org.eclipse.lsp4xml.dom.parser.Constants._PLS; +import static org.eclipse.lsp4xml.dom.parser.Constants._PVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._QMA; +import static org.eclipse.lsp4xml.dom.parser.Constants._RAN; +import static org.eclipse.lsp4xml.dom.parser.Constants._SIQ; +import static org.eclipse.lsp4xml.dom.parser.Constants._SVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._TVL; +import static org.eclipse.lsp4xml.dom.parser.Constants._UDS; +import static org.eclipse.lsp4xml.dom.parser.Constants._WSP; +import static org.eclipse.lsp4xml.dom.parser.Constants._YVL; + +import java.util.function.Predicate; import org.eclipse.lsp4xml.dom.DOMDocumentType.DocumentTypeKind;; @@ -20,16 +62,29 @@ */ public class XMLScanner implements Scanner { + private static final Predicate<Integer> START_ELEMENT_NAME_PREDICATE = ch -> { + // ^[_:\w] + return ch == _UDS || ch == _DDT || Character.isLetter(ch); + }; + + private static final Predicate<Integer> ELEMENT_NAME_PREDICATE = ch -> { + // [_:\w-.\d]* + return ch == _UDS /* '_' */ || ch == _DDT /* ':' */ 
|| ch == _DOT /* '.' */ || ch == _MIN /* '-' */ + || Character.isLetterOrDigit(ch); + }; + + private static final Predicate<Integer> ATTRIBUTE_NAME_PREDICATE = ch -> { + // ^[^\s\?\"'<>\/=\x00-\x0F\x7F\x80-\x9F]* + return !Character.isWhitespace(ch) && ch != _QMA && ch != _DQO && ch != _SIQ && ch != _LAN && ch != _RAN + && ch != _FSL && ch != _EQS && !(ch >= 0x00 && ch <= 0x0F) && ch != 0x7F && !(ch >= 0x80 && ch <= 0x9F); + }; + MultiLineStream stream; ScannerState state; int tokenOffset; TokenType tokenType; String tokenError; - - String lastTag; - String lastAttributeName; - String lastTypeValue; String lastDoctypeKind; String url; boolean isInsideDTDContent = false; // Either internal dtd in xml file OR external dtd in dtd file @@ -58,14 +113,51 @@ public XMLScanner(String input, int initialOffset, ScannerState initialState, bo this.isDTDFile = isDTDFile; } - String nextElementName() { - return stream.advanceIfRegExp(ELEMENT_NAME_REGEX); + /** + * Returns true if the current token is an element name and false otherwise. + * + * @return true if the current token is an element name and false otherwise. + */ + boolean hasNextElementName() { + // Element name regexp : ^[_:\w][_:\w-.\d]* + // ^[_:\w] + if (!START_ELEMENT_NAME_PREDICATE.test(stream.peekChar())) { + return false; + } + stream.advance(1); + // [_:\w-.\d]* + stream.advanceWhileChar(ELEMENT_NAME_PREDICATE); + return true; } - String nextAttributeName() { - return stream.advanceIfRegExp(ATTRIBUTE_NAME_REGEX); + /** + * Returns true if the current token is an attribute name and false otherwise. + * + * @return true if the current token is an attribute name and false otherwise. + */ + boolean hasNextAttributeName() { + // ^[^\s\?\"'<>\/=\x00-\x0F\x7F\x80-\x9F]* + return stream.advanceWhileChar(ATTRIBUTE_NAME_PREDICATE) > 0; } + /** + * Returns true if the current token is an attribute value and false otherwise. + * + * @return true if the current token is an attribute value and false otherwise. 
+ */ + boolean hasNextAttributeValue() { + // ^("[^"]*"?)|('[^']*'?) + int first = stream.peekChar(); + if (first == _SIQ || first == _DQO) { + stream.advance(1); + if( stream.advanceUntilChar(first)) { + stream.advance(1); + } + return true; + } + return false; + } + String doctypeName() { return stream.advanceIfRegExp(ELEMENT_NAME_REGEX); } @@ -210,8 +302,7 @@ TokenType internalScan() { return finishToken(offset, TokenType.CDATAContent); case AfterOpeningEndTag: - String tagName = nextElementName(); - if (tagName.length() > 0) { + if (hasNextElementName()) { state = ScannerState.WithinEndTag; return finishToken(offset, TokenType.EndTag); } @@ -243,10 +334,7 @@ TokenType internalScan() { return finishToken(offset, TokenType.Whitespace); case AfterOpeningStartTag: - lastTag = nextElementName(); - lastTypeValue = null; - lastAttributeName = null; - if (lastTag.length() > 0) { + if (hasNextElementName()) { state = ScannerState.WithinTag; return finishToken(offset, TokenType.StartTag); } @@ -272,8 +360,7 @@ TokenType internalScan() { return finishToken(offset, TokenType.PrologEnd); } - lastAttributeName = nextAttributeName(); - if (lastAttributeName.length() > 0) { + if (hasNextAttributeName()) { state = ScannerState.AfterAttributeName; return finishToken(offset, TokenType.AttributeName); } @@ -319,11 +406,7 @@ TokenType internalScan() { if (stream.skipWhitespace()) { return finishToken(offset, TokenType.Whitespace); } - String attributeValue = stream.advanceIfRegExp(ATTRIBUTE_VALUE_REGEX); - if (attributeValue.length() > 0) { - if ("type".equals(lastAttributeName)) { - lastTypeValue = attributeValue; - } + if (hasNextAttributeValue()) { state = ScannerState.WithinTag; return finishToken(offset, TokenType.AttributeValue); } diff --git a/org.eclipse.lsp4xml/src/test/java/org/eclipse/lsp4xml/dom/parser/XMLScannerTest.java b/org.eclipse.lsp4xml/src/test/java/org/eclipse/lsp4xml/dom/parser/XMLScannerTest.java index 4e6958dc5..3cf5eb308 100644 --- 
a/org.eclipse.lsp4xml/src/test/java/org/eclipse/lsp4xml/dom/parser/XMLScannerTest.java +++ b/org.eclipse.lsp4xml/src/test/java/org/eclipse/lsp4xml/dom/parser/XMLScannerTest.java @@ -64,6 +64,20 @@ public void testNestedElement() { assertOffsetAndToken(21, TokenType.EndTagClose); } + @Test + public void testMinusElement() { + scanner = XMLScanner.createScanner("<hello-word></hello-word>"); + + // <hello-word> + assertOffsetAndToken(0, TokenType.StartTagOpen); + assertOffsetAndToken(1, TokenType.StartTag, "hello-word"); + assertOffsetAndToken(11, TokenType.StartTagClose); + // </hello-word> + assertOffsetAndToken(12, TokenType.EndTagOpen); + assertOffsetAndToken(14, TokenType.EndTag, "hello-word"); + assertOffsetAndToken(24, TokenType.EndTagClose); + } + @Test public void testElementWithAttribute() { scanner = XMLScanner.createScanner("");