Skip to content

Commit

Permalink
Copy attribute source range when merging attributes
Browse files Browse the repository at this point in the history
Also simplified setting source range.

Fixes #2204
  • Loading branch information
jhy committed Nov 22, 2024
1 parent 5ee376b commit 91b5a56
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 24 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)

## 1.18.1 (2024-Jul-10)

Expand Down
19 changes: 19 additions & 0 deletions src/main/java/org/jsoup/nodes/Attributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,25 @@ public Range.AttributeRange sourceRange(String key) {
return (Map<String, Range.AttributeRange>) userData(AttrRangeKey);
}

/**
Set the source ranges (start to end position) from which this attribute's <b>name</b> and <b>value</b> were parsed.
@param key the attribute name
@param range the range for the attribute's name and value
@return these attributes, for chaining
@since 1.18.2
*/
public Attributes sourceRange(String key, Range.AttributeRange range) {
Validate.notNull(key);
Validate.notNull(range);
Map<String, Range.AttributeRange> ranges = getRanges();
if (ranges == null) {
ranges = new HashMap<>();
userData(AttrRangeKey, ranges);
}
ranges.put(key, range);
return this;
}


@Override
public Iterator<Attribute> iterator() {
Expand Down
31 changes: 18 additions & 13 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Range;

import java.util.ArrayList;

Expand Down Expand Up @@ -371,12 +372,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
stack = tb.getStack();
if (stack.size() > 0) {
Element html = tb.getStack().get(0);
if (startTag.hasAttributes()) {
for (Attribute attribute : startTag.attributes) {
if (!html.hasAttr(attribute.getKey()))
html.attributes().put(attribute);
}
}
mergeAttributes(startTag, html);
}
break;
case "body":
Expand All @@ -388,13 +384,8 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
} else {
tb.framesetOk(false);
// will be on stack if this is a nested body. won't be if closed (which is a variance from spec, which leaves it on)
Element body;
if (startTag.hasAttributes() && (body = tb.getFromStack("body")) != null) { // we only ever put one body on stack
for (Attribute attribute : startTag.attributes) {
if (!body.hasAttr(attribute.getKey()))
body.attributes().put(attribute);
}
}
Element body = tb.getFromStack("body");
if (body != null) mergeAttributes(startTag, body);
}
break;
case "frameset":
Expand Down Expand Up @@ -1841,6 +1832,20 @@ boolean processAsHtml(Token t, HtmlTreeBuilder tb) {
}
};

private static void mergeAttributes(Token.StartTag source, Element dest) {
if (!source.hasAttributes()) return;
for (Attribute attr : source.attributes) { // only iterates public attributes
Attributes destAttrs = dest.attributes();
if (!destAttrs.hasKey(attr.getKey())) {
Range.AttributeRange range = attr.sourceRange(); // need to grab range before its parent changes
destAttrs.put(attr);
if (source.trackSource) { // copy the attribute range
destAttrs.sourceRange(attr.getKey(), range);
}
}
}
}

private static final String nullString = String.valueOf('\u0000');

abstract boolean process(Token t, HtmlTreeBuilder tb);
Expand Down
12 changes: 2 additions & 10 deletions src/main/java/org/jsoup/parser/Token.java
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,8 @@ private void trackAttributeRange(String name) {
final boolean preserve = start.treeBuilder.settings.preserveAttributeCase();

assert attributes != null;
//noinspection unchecked
Map<String, Range.AttributeRange> attrRanges =
(Map<String, Range.AttributeRange>) attributes.userData(AttrRangeKey);
if (attrRanges == null) {
attrRanges = new HashMap<>();
attributes.userData(AttrRangeKey, attrRanges);
}

if (!preserve) name = Normalizer.lowerCase(name);
if (attrRanges.containsKey(name)) return; // dedupe ranges as we go; actual attributes get deduped later for error count
if (attributes.sourceRange(name).nameRange().isTracked()) return; // dedupe ranges as we go; actual attributes get deduped later for error count

// if there's no value (e.g. boolean), make it an implicit range at current
if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd;
Expand All @@ -218,7 +210,7 @@ private void trackAttributeRange(String name) {
new Range.Position(attrValStart, r.lineNumber(attrValStart), r.columnNumber(attrValStart)),
new Range.Position(attrValEnd, r.lineNumber(attrValEnd), r.columnNumber(attrValEnd)))
);
attrRanges.put(name, range);
attributes.sourceRange(name, range);
}
}

Expand Down
21 changes: 20 additions & 1 deletion src/test/java/org/jsoup/parser/PositionTest.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.jsoup.parser;

import org.jsoup.Jsoup;
import org.jsoup.TextUtil;
import org.jsoup.integration.servlets.FileServlet;
import org.jsoup.internal.Normalizer;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
Expand Down Expand Up @@ -567,6 +569,23 @@ private void printRange(Node node) {
assertEquals(expectedRange, attr2.sourceRange().toString());
}

@Test void movedAttributesHaveRange() {
// https://github.com/jhy/jsoup/issues/2204
String html = "<span id=1>One</span><html attr=foo><body class=2>Two</body><head title=3><body class=ok data=bar>";
// note that the attributes of the head el are not copied into the implicit head created by the span, per spec. html and body els are.
Document doc = Jsoup.parse(html, TrackingHtmlParser);
StringBuilder elTrack = new StringBuilder();
doc.forEachNode(node -> accumulatePositions(node, elTrack));

StringBuilder atTrack = new StringBuilder();
doc.forEachNode(node -> accumulateAttributePositions(node, atTrack));

assertEquals("#document:0-0~98-98; html:0-0~98-98; head:0-0~0-0; body:0-0~53-60; span:0-11~14-21; #text:11-14; #text:50-53; ", elTrack.toString());
assertEquals("attr:27-31=32-35; class:42-47=48-49; data:89-93=94-97; id:6-8=9-10; ", atTrack.toString());

assertEquals("<html attr=\"foo\"><head></head><body class=\"2\" data=\"bar\"><span id=\"1\">One</span>Two </body></html>", TextUtil.normalizeSpaces(doc.html()));
}

static void accumulateAttributePositions(Node node, StringBuilder sb) {
if (node instanceof LeafNode) return; // leafnode pseudo attributes are not tracked
for (Attribute attribute : node.attributes()) {
Expand All @@ -591,4 +610,4 @@ static void accumulatePositions(Attribute attr, StringBuilder sb) {

sb.append("; ");
}
}
}

0 comments on commit 91b5a56

Please sign in to comment.