Skip to content

Commit

Permalink
Moved attribute source range normal+dedupe into Token
Browse files Browse the repository at this point in the history
Dedupes and normalizes attribute source range as we go.

Ensures that the tracked range is for the correct attribute.

Further fixes #2067
  • Loading branch information
jhy committed Nov 28, 2023
1 parent fc4b175 commit 244db10
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 43 deletions.
18 changes: 0 additions & 18 deletions src/main/java/org/jsoup/nodes/Attributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import org.jsoup.SerializationException;
import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
Expand Down Expand Up @@ -525,23 +524,6 @@ public void normalize() {
if (!isInternalKey(keys[i]))
keys[i] = lowerCase(keys[i]);
}

// if we are tracking attribute source ranges, normalize those keys also
//noinspection unchecked
Map<String, Range.AttributeRange> ranges = (Map<String, Range.AttributeRange>) userData(AttrRangeKey);
if (ranges != null) {
Object[] names = ranges.keySet().toArray(); // copy to array to avoid CMEs during put
for (Object name : names) {
String normal = lowerCase((String) name);
if (normal.equals(name)) continue;
if (ranges.containsKey(normal)) {
ranges.remove(name); // dedupe now that we have normalized
} else {
Range.AttributeRange range = ranges.remove(name);
ranges.put(normal, range);
}
}
}
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
formattingElements = new ArrayList<>();
tmplInsertMode = new ArrayList<>();
pendingTableCharacters = new ArrayList<>();
emptyEnd = new Token.EndTag();
emptyEnd = new Token.EndTag(this);
framesetOk = true;
fosterInserts = false;
fragmentParsing = false;
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,9 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
* @return an unescaped string
*/
public static String unescapeEntities(String string, boolean inAttribute) {
Tokeniser tokeniser = new Tokeniser(new CharacterReader(string), ParseErrorList.noTracking(), false);
Parser parser = Parser.htmlParser();
parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
return tokeniser.unescapeEntities(inAttribute);
}

Expand Down
27 changes: 16 additions & 11 deletions src/main/java/org/jsoup/parser/Token.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Range;
import org.jspecify.annotations.Nullable;
Expand Down Expand Up @@ -122,11 +123,13 @@ static abstract class Tag extends Token {
private boolean hasEmptyAttrValue = false; // distinguish boolean attribute from empty string value

// attribute source range tracking
final TreeBuilder treeBuilder;
final boolean trackSource;
int attrNameStart, attrNameEnd, attrValStart, attrValEnd;

Tag(boolean trackSource) {
this.trackSource = trackSource;
Tag(TreeBuilder treeBuilder) {
this.treeBuilder = treeBuilder;
this.trackSource = treeBuilder.trackSourceRange;
}

@Override
Expand Down Expand Up @@ -187,7 +190,9 @@ else if (hasEmptyAttrValue)
private void trackAttributeRange(String name) {
if (trackSource && isStartTag()) {
final StartTag start = asStartTag();
final CharacterReader r = start.reader;
final CharacterReader r = start.treeBuilder.reader;
final boolean preserve = start.treeBuilder.settings.preserveAttributeCase();

assert attributes != null;
//noinspection unchecked
Map<String, Range.AttributeRange> attrRanges =
Expand All @@ -196,7 +201,9 @@ private void trackAttributeRange(String name) {
attrRanges = new HashMap<>();
attributes.userData(AttrRangeKey, attrRanges);
}
if (attrRanges.containsKey(name)) return; // dedupe ranges on case-sensitive name as we go; actual attributes get deduped later

if (!preserve) name = Normalizer.lowerCase(name);
if (attrRanges.containsKey(name)) return; // dedupe ranges as we go; actual attributes get deduped later for error count

// if there's no value (e.g. boolean), make it an implicit range at current
if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd;
Expand Down Expand Up @@ -343,13 +350,11 @@ private void ensureAttrValue(int startPos, int endPos) {
}

final static class StartTag extends Tag {
final CharacterReader reader;

// Reader is provided so if tracking, can get line / column positions for Range.
StartTag(boolean trackSource, CharacterReader reader) {
super(trackSource);
// TreeBuilder is provided so if tracking, can get line / column positions for Range; and can dedupe as we go
StartTag(TreeBuilder treeBuilder) {
super(treeBuilder);
type = TokenType.StartTag;
this.reader = reader;
}

@Override
Expand Down Expand Up @@ -377,8 +382,8 @@ public String toString() {
}

final static class EndTag extends Tag{
EndTag() {
super(false);
EndTag(TreeBuilder treeBuilder) {
super(treeBuilder);
type = TokenType.EndTag;
}

Expand Down
11 changes: 6 additions & 5 deletions src/main/java/org/jsoup/parser/Tokeniser.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ final class Tokeniser {
final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>

final Token.StartTag startPending;
final Token.EndTag endPending = new Token.EndTag();
final Token.EndTag endPending;
Token.Tag tagPending; // tag we are building up: start or end pending
final Token.Character charPending = new Token.Character();
final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
Expand All @@ -52,10 +52,11 @@ final class Tokeniser {
private static final int Unset = -1;
private int markupStartPos, charStartPos = Unset; // reader pos at the start of markup / characters. updated on state transition

Tokeniser(CharacterReader reader, ParseErrorList errors, boolean trackSource) {
tagPending = startPending = new Token.StartTag(trackSource, reader);
this.reader = reader;
this.errors = errors;
Tokeniser(TreeBuilder treeBuilder) {
tagPending = startPending = new Token.StartTag(treeBuilder);
endPending = new Token.EndTag(treeBuilder);
this.reader = treeBuilder.reader;
this.errors = treeBuilder.parser.getErrors();
}

Token read() {
Expand Down
14 changes: 7 additions & 7 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ abstract class TreeBuilder {
Map<String, Tag> seenTags; // tags we've used in this parse; saves tag GC for custom tags.

private Token.StartTag start; // start tag to process
private final Token.EndTag end = new Token.EndTag();
private final Token.EndTag end = new Token.EndTag(this);
abstract ParseSettings defaultSettings();

private boolean trackSourceRange; // optionally tracks the source range of nodes
boolean trackSourceRange; // optionally tracks the source range of nodes and attributes

void initialiseParse(Reader input, String baseUri, Parser parser) {
Validate.notNullParam(input, "input");
Expand All @@ -49,10 +49,10 @@ void initialiseParse(Reader input, String baseUri, Parser parser) {
trackSourceRange = parser.isTrackPosition();
reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility
currentToken = null;
tokeniser = new Tokeniser(reader, parser.getErrors(), trackSourceRange);
tokeniser = new Tokeniser(this);
stack = new ArrayList<>(32);
seenTags = new HashMap<>();
start = new Token.StartTag(trackSourceRange, reader);
start = new Token.StartTag(this);
this.baseUri = baseUri;
}

Expand Down Expand Up @@ -101,15 +101,15 @@ boolean processStartTag(String name) {
// these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position
final Token.StartTag start = this.start;
if (currentToken == start) { // don't recycle an in-use token
return process(new Token.StartTag(trackSourceRange, reader).name(name));
return process(new Token.StartTag(this).name(name));
}
return process(start.reset().name(name));
}

boolean processStartTag(String name, Attributes attrs) {
final Token.StartTag start = this.start;
if (currentToken == start) { // don't recycle an in-use token
return process(new Token.StartTag(trackSourceRange, reader).nameAttr(name, attrs));
return process(new Token.StartTag(this).nameAttr(name, attrs));
}
start.reset();
start.nameAttr(name, attrs);
Expand All @@ -118,7 +118,7 @@ boolean processStartTag(String name, Attributes attrs) {

boolean processEndTag(String name) {
if (currentToken == end) { // don't recycle an in-use token
return process(new Token.EndTag().name(name));
return process(new Token.EndTag(this).name(name));
}
return process(end.reset().name(name));
}
Expand Down
23 changes: 23 additions & 0 deletions src/test/java/org/jsoup/parser/PositionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,29 @@ private void printRange(Node node) {
assertEquals("id:3-5=6-7; ", xmlLcPos .toString());
}

@Test void trackAttributesPositionsDirectionalDedupes() {
String html = "<p Id=1 id=2 Id=3 Id=4 id=5 Id=6>";
Document htmlDoc = Jsoup.parse(html, TrackingHtmlParser);
Document htmlDocUc = Jsoup.parse(html, Parser.htmlParser().setTrackPosition(true).settings(new ParseSettings(true, true)));
Document xmlDoc = Jsoup.parse(html, TrackingXmlParser);
Document xmlDocLc = Jsoup.parse(html, Parser.xmlParser().setTrackPosition(true).settings(new ParseSettings(false, false)));

StringBuilder htmlPos = new StringBuilder();
StringBuilder htmlUcPos = new StringBuilder();
StringBuilder xmlPos = new StringBuilder();
StringBuilder xmlLcPos = new StringBuilder();

accumulateAttributePositions(htmlDoc .expectFirst("p"), htmlPos);
accumulateAttributePositions(htmlDocUc .expectFirst("p"), htmlUcPos);
accumulateAttributePositions(xmlDoc .expectFirst("p"), xmlPos);
accumulateAttributePositions(xmlDocLc .expectFirst("p"), xmlLcPos);

assertEquals("id:3-5=6-7; ", htmlPos .toString());
assertEquals("Id:3-5=6-7; id:8-10=11-12; ", htmlUcPos .toString());
assertEquals("Id:3-5=6-7; id:8-10=11-12; ", xmlPos .toString());
assertEquals("id:3-5=6-7; ", xmlLcPos .toString());
}

static void accumulateAttributePositions(Node node, StringBuilder sb) {
if (node instanceof LeafNode) return; // leafnode pseudo attributes are not tracked
for (Attribute attribute : node.attributes()) {
Expand Down

0 comments on commit 244db10

Please sign in to comment.