diff --git a/src/main/java/org/jsoup/nodes/Attributes.java b/src/main/java/org/jsoup/nodes/Attributes.java index d1fb6ca69f..ecbe9f4ead 100644 --- a/src/main/java/org/jsoup/nodes/Attributes.java +++ b/src/main/java/org/jsoup/nodes/Attributes.java @@ -2,7 +2,6 @@ import org.jsoup.SerializationException; import org.jsoup.helper.Validate; -import org.jsoup.internal.Normalizer; import org.jsoup.internal.SharedConstants; import org.jsoup.internal.StringUtil; import org.jsoup.parser.ParseSettings; @@ -525,23 +524,6 @@ public void normalize() { if (!isInternalKey(keys[i])) keys[i] = lowerCase(keys[i]); } - - // if we are tracking attribute source ranges, normalize those keys also - //noinspection unchecked - Map ranges = (Map) userData(AttrRangeKey); - if (ranges != null) { - Object[] names = ranges.keySet().toArray(); // copy to array to avoid CMEs during put - for (Object name : names) { - String normal = lowerCase((String) name); - if (normal.equals(name)) continue; - if (ranges.containsKey(normal)) { - ranges.remove(name); // dedupe now that we have normalized - } else { - Range.AttributeRange range = ranges.remove(name); - ranges.put(normal, range); - } - } - } } /** diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index b0fc8f3f62..f76fc486d0 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -88,7 +88,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { formattingElements = new ArrayList<>(); tmplInsertMode = new ArrayList<>(); pendingTableCharacters = new ArrayList<>(); - emptyEnd = new Token.EndTag(); + emptyEnd = new Token.EndTag(this); framesetOk = true; fosterInserts = false; fragmentParsing = false; diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index 7ae7f31cd1..9ec437b6d3 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -245,7 +245,9 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) { * @return an unescaped string */ public static String unescapeEntities(String string, boolean inAttribute) { - Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking(), false); + Parser parser = Parser.htmlParser(); + parser.treeBuilder.initialiseParse(new StringReader(string), "", parser); + Tokeniser tokeniser = new Tokeniser(parser.treeBuilder); return tokeniser.unescapeEntities(inAttribute); } diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java index c813e5d889..8c0133c53a 100644 --- a/src/main/java/org/jsoup/parser/Token.java +++ b/src/main/java/org/jsoup/parser/Token.java @@ -1,6 +1,7 @@ package org.jsoup.parser; import org.jsoup.helper.Validate; +import org.jsoup.internal.Normalizer; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Range; import org.jspecify.annotations.Nullable; @@ -122,11 +123,13 @@ static abstract class Tag extends Token { private boolean hasEmptyAttrValue = false; // distinguish boolean attribute from empty string value // attribute source range tracking + final TreeBuilder treeBuilder; final boolean trackSource; int attrNameStart, attrNameEnd, attrValStart, attrValEnd; - Tag(boolean trackSource) { - this.trackSource = trackSource; + Tag(TreeBuilder treeBuilder) { + this.treeBuilder = treeBuilder; + this.trackSource = treeBuilder.trackSourceRange; } @Override @@ -187,7 +190,9 @@ else if (hasEmptyAttrValue) private void trackAttributeRange(String name) { if (trackSource && isStartTag()) { final StartTag start = asStartTag(); - final CharacterReader r = start.reader; + final CharacterReader r = start.treeBuilder.reader; + final boolean preserve = start.treeBuilder.settings.preserveAttributeCase(); + assert attributes != null; //noinspection unchecked Map attrRanges = @@ -196,7 +201,9 @@ private void trackAttributeRange(String name) { attrRanges = new HashMap<>(); attributes.userData(AttrRangeKey, attrRanges); } - if (attrRanges.containsKey(name)) return; // dedupe ranges on case-sensitive name as we go; actual attributes get deduped later + + if (!preserve) name = Normalizer.lowerCase(name); + if (attrRanges.containsKey(name)) return; // dedupe ranges as we go; actual attributes get deduped later for error count // if there's no value (e.g. boolean), make it an implicit range at current if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd; @@ -343,13 +350,11 @@ private void ensureAttrValue(int startPos, int endPos) { } final static class StartTag extends Tag { - final CharacterReader reader; - // Reader is provided so if tracking, can get line / column positions for Range. - StartTag(boolean trackSource, CharacterReader reader) { - super(trackSource); + // TreeBuilder is provided so if tracking, can get line / column positions for Range; and can dedupe as we go + StartTag(TreeBuilder treeBuilder) { + super(treeBuilder); type = TokenType.StartTag; - this.reader = reader; } @Override @@ -377,8 +382,8 @@ public String toString() { } final static class EndTag extends Tag{ - EndTag() { - super(false); + EndTag(TreeBuilder treeBuilder) { + super(treeBuilder); type = TokenType.EndTag; } diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java index e76ba0d741..d77fbf9e81 100644 --- a/src/main/java/org/jsoup/parser/Tokeniser.java +++ b/src/main/java/org/jsoup/parser/Tokeniser.java @@ -41,7 +41,7 @@ final class Tokeniser { final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for final Token.StartTag startPending; - final Token.EndTag endPending = new Token.EndTag(); + final Token.EndTag endPending; Token.Tag tagPending; // tag we are building up: start or end pending final Token.Character charPending = new Token.Character(); final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up @@ -52,10 +52,11 @@ final class Tokeniser { private static final int Unset = -1; private int markupStartPos, charStartPos = Unset; // reader pos at the start of markup / characters. updated on state transition - Tokeniser(CharacterReader reader, ParseErrorList errors, boolean trackSource) { - tagPending = startPending = new Token.StartTag(trackSource, reader); - this.reader = reader; - this.errors = errors; + Tokeniser(TreeBuilder treeBuilder) { + tagPending = startPending = new Token.StartTag(treeBuilder); + endPending = new Token.EndTag(treeBuilder); + this.reader = treeBuilder.reader; + this.errors = treeBuilder.parser.getErrors(); } Token read() { diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index eadfa28447..81f1fb57e0 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -31,10 +31,10 @@ abstract class TreeBuilder { Map seenTags; // tags we've used in this parse; saves tag GC for custom tags. private Token.StartTag start; // start tag to process - private final Token.EndTag end = new Token.EndTag(); + private final Token.EndTag end = new Token.EndTag(this); abstract ParseSettings defaultSettings(); - private boolean trackSourceRange; // optionally tracks the source range of nodes + boolean trackSourceRange; // optionally tracks the source range of nodes and attributes void initialiseParse(Reader input, String baseUri, Parser parser) { Validate.notNullParam(input, "input"); @@ -49,10 +49,10 @@ void initialiseParse(Reader input, String baseUri, Parser parser) { trackSourceRange = parser.isTrackPosition(); reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility currentToken = null; - tokeniser = new Tokeniser(reader, parser.getErrors(), trackSourceRange); + tokeniser = new Tokeniser(this); stack = new ArrayList<>(32); seenTags = new HashMap<>(); - start = new Token.StartTag(trackSourceRange, reader); + start = new Token.StartTag(this); this.baseUri = baseUri; } @@ -101,7 +101,7 @@ boolean processStartTag(String name) { // these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token - return process(new Token.StartTag(trackSourceRange, reader).name(name)); + return process(new Token.StartTag(this).name(name)); } return process(start.reset().name(name)); } @@ -109,7 +109,7 @@ boolean processStartTag(String name) { boolean processStartTag(String name, Attributes attrs) { final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token - return process(new Token.StartTag(trackSourceRange, reader).nameAttr(name, attrs)); + return process(new Token.StartTag(this).nameAttr(name, attrs)); } start.reset(); start.nameAttr(name, attrs); @@ -118,7 +118,7 @@ boolean processStartTag(String name, Attributes attrs) { boolean processEndTag(String name) { if (currentToken == end) { // don't recycle an in-use token - return process(new Token.EndTag().name(name)); + return process(new Token.EndTag(this).name(name)); } return process(end.reset().name(name)); } diff --git a/src/test/java/org/jsoup/parser/PositionTest.java b/src/test/java/org/jsoup/parser/PositionTest.java index 677347327a..e4662c6546 100644 --- a/src/test/java/org/jsoup/parser/PositionTest.java +++ b/src/test/java/org/jsoup/parser/PositionTest.java @@ -447,6 +447,29 @@ private void printRange(Node node) { assertEquals("id:3-5=6-7; ", xmlLcPos .toString()); } + @Test void trackAttributesPositionsDirectionalDedupes() { + String html = "

"; + Document htmlDoc = Jsoup.parse(html, TrackingHtmlParser); + Document htmlDocUc = Jsoup.parse(html, Parser.htmlParser().setTrackPosition(true).settings(new ParseSettings(true, true))); + Document xmlDoc = Jsoup.parse(html, TrackingXmlParser); + Document xmlDocLc = Jsoup.parse(html, Parser.xmlParser().setTrackPosition(true).settings(new ParseSettings(false, false))); + + StringBuilder htmlPos = new StringBuilder(); + StringBuilder htmlUcPos = new StringBuilder(); + StringBuilder xmlPos = new StringBuilder(); + StringBuilder xmlLcPos = new StringBuilder(); + + accumulateAttributePositions(htmlDoc .expectFirst("p"), htmlPos); + accumulateAttributePositions(htmlDocUc .expectFirst("p"), htmlUcPos); + accumulateAttributePositions(xmlDoc .expectFirst("p"), xmlPos); + accumulateAttributePositions(xmlDocLc .expectFirst("p"), xmlLcPos); + + assertEquals("id:3-5=6-7; ", htmlPos .toString()); + assertEquals("Id:3-5=6-7; id:8-10=11-12; ", htmlUcPos .toString()); + assertEquals("Id:3-5=6-7; id:8-10=11-12; ", xmlPos .toString()); + assertEquals("id:3-5=6-7; ", xmlLcPos .toString()); + } + static void accumulateAttributePositions(Node node, StringBuilder sb) { if (node instanceof LeafNode) return; // leafnode pseudo attributes are not tracked for (Attribute attribute : node.attributes()) {