From 73d450657932370e516f205537b395d1d055d043 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 23 Nov 2023 17:31:36 +1100 Subject: [PATCH] Refactored UserData to be tucked into a hash (#2060) And make sure attribute source ranges are copied in the cleaner --- .../org/jsoup/internal/SharedConstants.java | 10 +-- src/main/java/org/jsoup/nodes/Attribute.java | 10 +-- src/main/java/org/jsoup/nodes/Attributes.java | 79 ++++++++++++------- src/main/java/org/jsoup/nodes/Element.java | 9 ++- src/main/java/org/jsoup/nodes/Range.java | 26 ++---- src/main/java/org/jsoup/parser/Token.java | 20 +++-- .../java/org/jsoup/parser/TreeBuilder.java | 3 +- src/main/java/org/jsoup/safety/Cleaner.java | 19 ++--- .../java/org/jsoup/nodes/PositionTest.java | 2 - .../java/org/jsoup/safety/CleanerTest.java | 16 ++-- 10 files changed, 98 insertions(+), 96 deletions(-) diff --git a/src/main/java/org/jsoup/internal/SharedConstants.java b/src/main/java/org/jsoup/internal/SharedConstants.java index b8d9d851fc..141e1df9a8 100644 --- a/src/main/java/org/jsoup/internal/SharedConstants.java +++ b/src/main/java/org/jsoup/internal/SharedConstants.java @@ -5,12 +5,10 @@ this package when modules are enabled. */ public final class SharedConstants { - // Indicates a jsoup internal key. Can't be set via HTML. (It could be set via accessor, but not too worried about - // that. Suppressed from list, iter. - public static final char InternalPrefix = '/'; - public static final String PrivatePrefix = "/jsoup."; - - public static final String AttrRange = PrivatePrefix + "attrRange."; + public static final String UserDataKey = "/jsoup.userdata"; + public final static String AttrRangeKey = "jsoup.attrs"; + public static final String RangeKey = "jsoup.start"; + public static final String EndRangeKey = "jsoup.end"; public static final int DefaultBufferSize = 1024 * 32; diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index 5d37d98c68..f77106db1d 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -141,7 +141,7 @@ Get the source ranges (start to end positions) in the original input source from @since 1.17.1 */ public Range.AttributeRange sourceRange() { - if (parent == null) return Range.AttributeRange.Untracked; + if (parent == null) return Range.AttributeRange.UntrackedAttr; return parent.sourceRange(key); } @@ -211,14 +211,6 @@ protected static boolean isDataAttribute(String key) { return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length(); } - /** - Is this an internal attribute? Internal attributes can be fetched by key, but are not serialized. - * @return if an internal attribute. - */ - public boolean isInternal() { - return Attributes.isInternalKey(key); - } - /** * Collapsible if it's a boolean attribute and value is empty or same as name * diff --git a/src/main/java/org/jsoup/nodes/Attributes.java b/src/main/java/org/jsoup/nodes/Attributes.java index ae07731f55..ecbe9f4ead 100644 --- a/src/main/java/org/jsoup/nodes/Attributes.java +++ b/src/main/java/org/jsoup/nodes/Attributes.java @@ -14,6 +14,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.ConcurrentModificationException; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -21,7 +22,8 @@ import java.util.Set; import static org.jsoup.internal.Normalizer.lowerCase; -import static org.jsoup.nodes.Range.AttributeRange.Untracked; +import static org.jsoup.internal.SharedConstants.AttrRangeKey; +import static org.jsoup.nodes.Range.AttributeRange.UntrackedAttr; /** * The attributes of an Element. @@ -39,6 +41,10 @@ * @author Jonathan Hedley, jonathan@hedley.net */ public class Attributes implements Iterable, Cloneable { + // Indicates an internal key. Can't be set via HTML. (It could be set via accessor, but not too worried about + // that. Suppressed from list, iter.) + static final char InternalPrefix = '/'; + // The Attributes object is only created on the first use of an attribute; the Element will just have a null // Attribute slot otherwise protected static final String dataPrefix = "data-"; @@ -114,21 +120,6 @@ public String getIgnoreCase(String key) { return i == NotFound ? EmptyString : checkNotNull(vals[i]); } - /** - Get an arbitrary user data object by key. - * @param key case-sensitive key to the object. - * @return the object associated to this key, or {@code null} if not found. - * @see #userData(String key, Object val) - * @since 1.17.2 - */ - @Nullable - public Object userData(String key) { - Validate.notNull(key); - if (!isInternalKey(key)) key = internalKey(key); - int i = indexOfKey(key); - return i == NotFound ? null : vals[i]; - } - /** * Adds a new attribute. Will produce duplicates if the key already exists. * @see Attributes#put(String, String) @@ -161,6 +152,40 @@ public Attributes put(String key, @Nullable String value) { return this; } + /** + Get the map holding any user-data associated with these Attributes. Will be created empty on first use. Held as + an internal attribute, not a field member, to reduce the memory footprint of Attributes when not used. Can hold + arbitrary objects; use for source ranges, connecting W3C nodes to Elements, etc. + * @return the map holding user-data + */ + Map userData() { + final Map userData; + int i = indexOfKey(SharedConstants.UserDataKey); + if (i == NotFound) { + userData = new HashMap<>(); + addObject(SharedConstants.UserDataKey, userData); + } else { + //noinspection unchecked + userData = (Map) vals[i]; + } + return userData; + } + + /** + Get an arbitrary user-data object by key. + * @param key case-sensitive key to the object. + * @return the object associated to this key, or {@code null} if not found. + * @see #userData(String key, Object val) + * @since 1.17.1 + */ + @Nullable + public Object userData(String key) { + Validate.notNull(key); + if (!hasKey(SharedConstants.UserDataKey)) return null; // no user data exists + Map userData = userData(); + return userData.get(key); + } + /** Set an arbitrary user-data object by key. Will be treated as an internal attribute, so will not be emitted in HTML. * @param key case-sensitive key @@ -171,13 +196,7 @@ public Attributes put(String key, @Nullable String value) { */ public Attributes userData(String key, Object value) { Validate.notNull(key); - if (!isInternalKey(key)) key = internalKey(key); - Validate.notNull(value); - int i = indexOfKey(key); - if (i != NotFound) - vals[i] = value; - else - addObject(key, value); + userData().put(key, value); return this; } @@ -340,10 +359,12 @@ Get the source ranges (start to end position) in the original input source from @since 1.17.1 */ public Range.AttributeRange sourceRange(String key) { - if (!hasKey(key)) return Untracked; - final String rangeKey = SharedConstants.AttrRange + key; - if (!hasDeclaredValueForKey(rangeKey)) return Untracked; - return (Range.AttributeRange) Validate.ensureNotNull(userData(rangeKey)); + if (!hasKey(key)) return UntrackedAttr; + //noinspection unchecked + Map ranges = (Map) userData(AttrRangeKey); + if (ranges == null) return Range.AttributeRange.UntrackedAttr; + Range.AttributeRange range = ranges.get(key); + return range != null ? range : Range.AttributeRange.UntrackedAttr; } @Override @@ -592,10 +613,10 @@ private static String dataKey(String key) { } static String internalKey(String key) { - return SharedConstants.InternalPrefix + key; + return InternalPrefix + key; } static boolean isInternalKey(String key) { - return key != null && key.length() > 1 && key.charAt(0) == SharedConstants.InternalPrefix; + return key != null && key.length() > 1 && key.charAt(0) == InternalPrefix; } } diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index c1cf7584d3..a1b09b3676 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -1821,7 +1821,9 @@ public Element clone() { @Override public Element shallowClone() { // simpler than implementing a clone version with no child copy - return new Element(tag, baseUri(), attributes == null ? null : attributes.clone()); + String baseUri = baseUri(); + if (baseUri.equals("")) baseUri = null; // saves setting a blank internal attribute + return new Element(tag, baseUri, attributes == null ? null : attributes.clone()); } @Override @@ -1838,8 +1840,9 @@ protected Element doClone(@Nullable Node parent) { @Override public Element clearAttributes() { if (attributes != null) { - super.clearAttributes(); - attributes = null; + super.clearAttributes(); // keeps internal attributes via iterator + if (attributes.size() == 0) + attributes = null; // only remove entirely if no internal attributes } return this; diff --git a/src/main/java/org/jsoup/nodes/Range.java b/src/main/java/org/jsoup/nodes/Range.java index 87c8e53c76..955c043a8c 100644 --- a/src/main/java/org/jsoup/nodes/Range.java +++ b/src/main/java/org/jsoup/nodes/Range.java @@ -1,10 +1,5 @@ package org.jsoup.nodes; -import org.jsoup.helper.Validate; -import org.jsoup.internal.SharedConstants; - -import java.util.Objects; - import static org.jsoup.internal.SharedConstants.*; /** @@ -15,11 +10,8 @@ @since 1.15.2 */ public class Range { - private final Position start, end; - - private static final String RangeKey = PrivatePrefix + "sourceRange"; - private static final String EndRangeKey = PrivatePrefix + "endSourceRange"; private static final Position UntrackedPos = new Position(-1, -1, -1); + private final Position start, end; /** An untracked source range. */ static final Range Untracked = new Range(UntrackedPos, UntrackedPos); @@ -98,18 +90,16 @@ public boolean isImplicit() { */ static Range of(Node node, boolean start) { final String key = start ? RangeKey : EndRangeKey; - if (!node.hasAttr(key)) return Untracked; - return (Range) Validate.ensureNotNull(node.attributes().userData(key)); + if (!node.hasAttributes()) return Untracked; + Object range = node.attributes().userData(key); + return range != null ? (Range) range : Untracked; } /** - Internal jsoup method, called by the TreeBuilder. Tracks a Range for a Node. - * @param node the node to associate this position to - * @param start if this is the starting range. {@code false} for Element end tags. + @deprecated no-op; internal method moved out of visibility */ - public void track(Node node, boolean start) { - node.attributes().userData(start ? RangeKey : EndRangeKey, this); - } + @Deprecated + public void track(Node node, boolean start) {} @Override public boolean equals(Object o) { @@ -222,7 +212,7 @@ public int hashCode() { } public static class AttributeRange { - static final AttributeRange Untracked = new AttributeRange(Range.Untracked, Range.Untracked); + static final AttributeRange UntrackedAttr = new AttributeRange(Range.Untracked, Range.Untracked); private final Range nameRange; private final Range valueRange; diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java index 8314f5ea68..c813e5d889 100644 --- a/src/main/java/org/jsoup/parser/Token.java +++ b/src/main/java/org/jsoup/parser/Token.java @@ -1,11 +1,13 @@ package org.jsoup.parser; import org.jsoup.helper.Validate; -import org.jsoup.internal.SharedConstants; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Range; import org.jspecify.annotations.Nullable; +import java.util.HashMap; +import java.util.Map; + import static org.jsoup.internal.SharedConstants.*; @@ -186,6 +188,15 @@ private void trackAttributeRange(String name) { if (trackSource && isStartTag()) { final StartTag start = asStartTag(); final CharacterReader r = start.reader; + assert attributes != null; + //noinspection unchecked + Map attrRanges = + (Map) attributes.userData(AttrRangeKey); + if (attrRanges == null) { + attrRanges = new HashMap<>(); + attributes.userData(AttrRangeKey, attrRanges); + } + if (attrRanges.containsKey(name)) return; // dedupe ranges on case-sensitive name as we go; actual attributes get deduped later // if there's no value (e.g. boolean), make it an implicit range at current if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd; @@ -198,12 +209,7 @@ private void trackAttributeRange(String name) { new Range.Position(attrValStart, r.lineNumber(attrValStart), r.columnNumber(attrValStart)), new Range.Position(attrValEnd, r.lineNumber(attrValEnd), r.columnNumber(attrValEnd))) ); - - // todo - deduping as we go as case-sensitive key; want first - String key = AttrRange + name; - - assert attributes != null; - attributes.userData(key, range); + attrRanges.put(name, range); } } diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index e20a641de2..eadfa28447 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -2,7 +2,6 @@ import org.jsoup.helper.Validate; import org.jsoup.internal.SharedConstants; -import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -279,6 +278,6 @@ private void trackNodePosition(Node node, boolean isStart) { Range.Position endPosition = new Range.Position (endPos, reader.lineNumber(endPos), reader.columnNumber(endPos)); Range range = new Range(startPosition, endPosition); - range.track(node, isStart); + node.attributes().userData(isStart ? SharedConstants.RangeKey : SharedConstants.EndRangeKey, range); } } diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index f15399be62..b84608abac 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -9,15 +9,12 @@ import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.ParseErrorList; -import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; -import org.jsoup.parser.Tag; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; import java.util.List; - /** The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes that you are expecting; no junk, and no cross-site scripting attacks! @@ -178,11 +175,12 @@ private int copySafeNodes(Element source, Element dest) { } private ElementMeta createSafeElement(Element sourceEl) { + Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data String sourceTag = sourceEl.tagName(); - Attributes destAttrs = new Attributes(); - Element dest = new Element(sourceEl.tag(), sourceEl.baseUri(), destAttrs); - int numDiscarded = 0; + Attributes destAttrs = dest.attributes(); + dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy + int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) @@ -192,14 +190,7 @@ private ElementMeta createSafeElement(Element sourceEl) { } Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag); destAttrs.addAll(enforcedAttrs); - - // Copy the original start and end range, if set - // TODO - might be good to make a generic Element#userData set type interface, and copy those all over - if (sourceEl.sourceRange().isTracked()) - sourceEl.sourceRange().track(dest, true); - if (sourceEl.endSourceRange().isTracked()) - sourceEl.endSourceRange().track(dest, false); - + dest.attributes().addAll(destAttrs); // re-attach, if removed in clear return new ElementMeta(dest, numDiscarded); } diff --git a/src/test/java/org/jsoup/nodes/PositionTest.java b/src/test/java/org/jsoup/nodes/PositionTest.java index 2676641a27..ee6f49d79f 100644 --- a/src/test/java/org/jsoup/nodes/PositionTest.java +++ b/src/test/java/org/jsoup/nodes/PositionTest.java @@ -311,7 +311,6 @@ private void printRange(Node node) { StringBuilder track = new StringBuilder(); for (Attribute attr : div.attributes()) { - if (attr.isInternal()) continue; Range.AttributeRange attrRange = attr.sourceRange(); assertTrue(attrRange.nameRange().isTracked()); @@ -339,7 +338,6 @@ private void printRange(Node node) { StringBuilder track = new StringBuilder(); for (Attribute attr : div.attributes()) { - if (attr.isInternal()) continue; Range.AttributeRange attrRange = attr.sourceRange(); assertTrue(attrRange.nameRange().isTracked()); assertTrue(attrRange.valueRange().isTracked()); diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index 101dfbf8a6..961a7636c2 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -10,7 +10,6 @@ import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; import java.util.Arrays; @@ -392,16 +391,21 @@ public void bailsIfRemovingProtocolThatsNotSet() { } @Test void preservesSourcePositionViaUserData() { - Document orig = Jsoup.parse("\n

Hello

", Parser.htmlParser().setTrackPosition(true)); + Document orig = Jsoup.parse("\n

Hello

", Parser.htmlParser().setTrackPosition(true)); Element p = orig.expectFirst("p"); Range origRange = p.sourceRange(); - assertEquals("2,2:22-2,5:25", origRange.toString()); + assertEquals("2,2:22-2,10:30", origRange.toString()); - Document clean = new Cleaner(Safelist.relaxed()).clean(orig); + Range.AttributeRange attributeRange = p.attributes().sourceRange("id"); + assertEquals("2,5:25-2,7:27=2,8:28-2,9:29", attributeRange.toString()); + + Document clean = new Cleaner(Safelist.relaxed().addAttributes("p", "id")).clean(orig); Element cleanP = clean.expectFirst("p"); + assertEquals("1", cleanP.id()); Range cleanRange = cleanP.sourceRange(); - assertEquals(cleanRange, origRange); - assertEquals(clean.endSourceRange(), orig.endSourceRange()); + assertEquals(origRange, cleanRange); + assertEquals(orig.endSourceRange(), clean.endSourceRange()); + assertEquals(attributeRange, cleanP.attributes().sourceRange("id")); } @ParameterizedTest @ValueSource(booleans = {true, false})