From 5ee376bc5bbe6a4f66eb24e2180552b547b8a4bf Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 22 Nov 2024 13:03:49 +1100 Subject: [PATCH] Entity decoding supports prefix matches When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). Finds the longest prefix. Validated that this matches browser behavior, and that extended entities *don't* prefix-match (like `&clubsuite;`). Fixes #2207 --- CHANGES.md | 2 ++ pom.xml | 1 + src/main/java/org/jsoup/nodes/Entities.java | 24 +++++++++++++++++++ src/main/java/org/jsoup/parser/Tokeniser.java | 7 +++++- .../java/org/jsoup/nodes/EntitiesTest.java | 7 ++++++ .../java/org/jsoup/parser/HtmlParserTest.java | 22 ++++++++++++++--- 6 files changed, 59 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 73d01e173c..5c2438b023 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -33,6 +33,8 @@ `Connection.Response#cookies()` will provide the last one set. Generally it is better to use the [Jsoup.newSession](https://jsoup.org/cookbook/web/request-session) method to maintain a cookie jar, as that applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831) +* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an + attribute). 
[2207](https://github.com/jhy/jsoup/issues/2207) ## 1.18.1 (2024-Jul-10) diff --git a/pom.xml b/pom.xml index aa01a6f380..042477e7fe 100644 --- a/pom.xml +++ b/pom.xml @@ -98,6 +98,7 @@ java.io.UncheckedIOException java.util.Comparator java.util.List + java.util.ArrayList java.util.LinkedHashMap java.util.Map java.util.Objects diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java index c4503dd02e..bff4d35fe5 100644 --- a/src/main/java/org/jsoup/nodes/Entities.java +++ b/src/main/java/org/jsoup/nodes/Entities.java @@ -11,7 +11,9 @@ import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import static org.jsoup.nodes.Document.OutputSettings.*; @@ -36,6 +38,9 @@ public class Entities { private static final char[] codeDelims = {',', ';'}; private static final HashMap multipoints = new HashMap<>(); // name -> multiple character references + private static final int BaseCount = 106; + private static final ArrayList baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching + public enum EscapeMode { /** * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. @@ -50,6 +55,12 @@ public enum EscapeMode { */ extended(EntitiesData.fullPoints, 2125); + static { + // sort the base names by length, for prefix matching + Collections.addAll(baseSorted, base.nameKeys); + baseSorted.sort((a, b) -> b.length() - a.length()); + } + // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. private String[] nameKeys; private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 
@@ -134,6 +145,19 @@ public static int codepointsForName(final String name, final int[] codepoints) { return 0; } + /** + Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not". + + @return longest entity name that is a prefix of the input, or "" if no entity matches + */ + public static String findPrefix(String input) { + for (String name : baseSorted) { + if (input.startsWith(name)) return name; + } + return emptyName; + // if perf critical, could look at using a Trie vs a scan + } + /** HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use both in attributes and in text data. diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java index ff3bfa4f18..750ee0701f 100644 --- a/src/main/java/org/jsoup/parser/Tokeniser.java +++ b/src/main/java/org/jsoup/parser/Tokeniser.java @@ -228,7 +228,12 @@ void advanceTransition(TokeniserState newState) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError("invalid named reference [%s]", nameRef); - return null; + if (inAttribute) return null; + // check if there's a base prefix match; consume and use that if so + String prefix = Entities.findPrefix(nameRef); + if (prefix.isEmpty()) return null; + reader.matchConsume(prefix); + nameRef = prefix; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match diff --git a/src/test/java/org/jsoup/nodes/EntitiesTest.java b/src/test/java/org/jsoup/nodes/EntitiesTest.java index 59708cdf99..fb9d278cfc 100644 --- a/src/test/java/org/jsoup/nodes/EntitiesTest.java +++ b/src/test/java/org/jsoup/nodes/EntitiesTest.java @@ -112,6 +112,13 @@ public class EntitiesTest { assertEquals("Hello &= &", Entities.unescape(text, false)); } + @Test public void prefixMatch() { + // https://github.com/jhy/jsoup/issues/2207 + // example 
from https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state + String text = "I'm ¬it; I tell you. I'm ∉ I tell you."; + assertEquals("I'm ¬it; I tell you. I'm ∉ I tell you.", Entities.unescape(text, false)); + assertEquals("I'm ¬it; I tell you. I'm ∉ I tell you.", Entities.unescape(text, true)); // not for attributes + } @Test public void caseSensitive() { String unescaped = "Ü ü & &"; diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index a67003a839..d87318bda0 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -925,9 +925,8 @@ private static Stream dupeAttributeData() { assertEquals("
  1. One

Two

", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml())); } - @Test public void doesNotFindShortestMatchingEntity() { - // previous behaviour was to identify a possible entity, then chomp down the string until a match was found. - // (as defined in html5.) However in practise that lead to spurious matches against the author's intent. + @Test public void doesNotFindExtendedPrefixMatchingEntity() { + // only base entities, not extended entities, should allow prefix match (i.e., those in the spec named list that don't include a trailing ; - https://html.spec.whatwg.org/multipage/named-characters.html) String html = "One &clubsuite; ♣"; Document doc = Jsoup.parse(html); assertEquals(StringUtil.normaliseWhitespace("One &clubsuite; ♣"), doc.body().html()); @@ -941,6 +940,23 @@ private static Stream dupeAttributeData() { assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html()); } + @Test public void findsBasePrefixEntity() { + // https://github.com/jhy/jsoup/issues/2207 + String html = "a c­c I'm ¬it; I tell you. I'm ∉ I tell you."; + Document doc = Jsoup.parse(html); + doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); + assertEquals("a c­c I'm ¬it; I tell you. I'm ∉ I tell you.", doc.body().html()); + assertEquals("a cc I'm ¬it; I tell you. I'm ∉ I tell you.", doc.body().text()); + + // and in an attribute: + html = "One"; + doc = Jsoup.parse(html); + doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii"); + Element el = doc.expectFirst("a"); + assertEquals("One", el.outerHtml()); + assertEquals(" c­c I'm ¬it; I tell you. I'm ∉ I tell you.", el.attr("title")); + } + @Test public void handlesXmlDeclarationAsBogusComment() { String html = "One"; Document doc = Jsoup.parse(html);