Skip to content

Commit

Permalink
Entity decoding supports prefix matches
Browse files Browse the repository at this point in the history
When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an attribute). The longest matching prefix is used.

Validated that this matches browser behavior, and that extended entities *don't* prefix-match (like `&clubsuite;`).

Fixes #2207
  • Loading branch information
jhy committed Nov 22, 2024
1 parent 708fc1f commit 5ee376b
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
`Connection.Response#cookies()` will provide the last one set. Generally it is better to use
the [Jsoup.newSession](https://jsoup.org/cookbook/web/request-session) method to maintain a cookie jar, as that
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
attribute). [2207](https://github.com/jhy/jsoup/issues/2207)

## 1.18.1 (2024-Jul-10)

Expand Down
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
<ignore>java.io.UncheckedIOException</ignore>
<ignore>java.util.Comparator</ignore> <!-- Comparator.comparingInt() -->
<ignore>java.util.List</ignore> <!-- List#stream() -->
<ignore>java.util.ArrayList</ignore> <!-- List / ArrayList #sort() -->
<ignore>java.util.LinkedHashMap</ignore> <!-- LinkedHashMap#computeIfAbsent() -->
<ignore>java.util.Map</ignore> <!-- Map#computeIfAbsent() -->
<ignore>java.util.Objects</ignore>
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/jsoup/nodes/Entities.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;

import static org.jsoup.nodes.Document.OutputSettings.*;
Expand All @@ -36,6 +38,9 @@ public class Entities {
private static final char[] codeDelims = {',', ';'};
private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references

private static final int BaseCount = 106;
private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching

public enum EscapeMode {
/**
* Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
Expand All @@ -50,6 +55,12 @@ public enum EscapeMode {
*/
extended(EntitiesData.fullPoints, 2125);

static {
    // order the base entity names longest-first, so that a prefix scan meets the longest candidate first
    Collections.addAll(baseSorted, base.nameKeys);
    baseSorted.sort((left, right) -> Integer.compare(right.length(), left.length()));
}

// table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
private String[] nameKeys;
private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
Expand Down Expand Up @@ -134,6 +145,19 @@ public static int codepointsForName(final String name, final int[] codepoints) {
return 0;
}

/**
 Looks up the longest base named entity that the input starts with. For example, the input "notit" yields "not".
 @param input the candidate entity name to test
 @return the longest base entity name that prefixes the input, or "" if no entity matches
 */
public static String findPrefix(String input) {
    // baseSorted holds the names longest-first, so the first hit is the longest match
    final int count = baseSorted.size();
    for (int i = 0; i < count; i++) {
        final String candidate = baseSorted.get(i);
        if (input.startsWith(candidate)) {
            return candidate;
        }
    }
    // plain linear scan; if this becomes perf critical, could look at using a Trie instead
    return emptyName;
}

/**
HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
both in attributes and in text data.
Expand Down
7 changes: 6 additions & 1 deletion src/main/java/org/jsoup/parser/Tokeniser.java
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,12 @@ void advanceTransition(TokeniserState newState) {
reader.rewindToMark();
if (looksLegit) // named with semicolon
characterReferenceError("invalid named reference [%s]", nameRef);
return null;
if (inAttribute) return null;
// check if there's a base prefix match; consume and use that if so
String prefix = Entities.findPrefix(nameRef);
if (prefix.isEmpty()) return null;
reader.matchConsume(prefix);
nameRef = prefix;
}
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
// don't want that to match
Expand Down
7 changes: 7 additions & 0 deletions src/test/java/org/jsoup/nodes/EntitiesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ public class EntitiesTest {
assertEquals("Hello &= &", Entities.unescape(text, false));
}

@Test
public void prefixMatch() {
    // https://github.com/jhy/jsoup/issues/2207
    // sample input taken from https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
    final String input = "I'm &notit; I tell you. I'm &notin; I tell you.";

    // with the flag off, "not" is resolved as a prefix of "notit"
    final String plain = Entities.unescape(input, false);
    assertEquals("I'm ¬it; I tell you. I'm ∉ I tell you.", plain);

    // with the flag on (attribute handling), the prefix match is not applied
    final String attribute = Entities.unescape(input, true);
    assertEquals("I'm &notit; I tell you. I'm ∉ I tell you.", attribute);
}

@Test public void caseSensitive() {
String unescaped = "Ü ü & &";
Expand Down
22 changes: 19 additions & 3 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -925,9 +925,8 @@ private static Stream<Arguments> dupeAttributeData() {
assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml()));
}

@Test public void doesNotFindShortestMatchingEntity() {
// previous behaviour was to identify a possible entity, then chomp down the string until a match was found.
// (as defined in html5.) However in practise that lead to spurious matches against the author's intent.
@Test public void doesNotFindExtendedPrefixMatchingEntity() {
// only base entities, not extended entities, should allow prefix match (i.e., those in the spec named list that don't include a trailing ; - https://html.spec.whatwg.org/multipage/named-characters.html)
String html = "One &clubsuite; &clubsuit;";
Document doc = Jsoup.parse(html);
assertEquals(StringUtil.normaliseWhitespace("One &amp;clubsuite; ♣"), doc.body().html());
Expand All @@ -941,6 +940,23 @@ private static Stream<Arguments> dupeAttributeData() {
assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", doc.body().html());
}

@Test public void findsBasePrefixEntity() {
// https://github.com/jhy/jsoup/issues/2207
String html = "a&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.";
Document doc = Jsoup.parse(html);
doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
assertEquals("a&nbsp;c&shy;c I'm &not;it; I tell you. I'm &notin; I tell you.", doc.body().html());
assertEquals("a cc I'm ¬it; I tell you. I'm ∉ I tell you.", doc.body().text());

// and in an attribute:
html = "<a title=\"&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.\">One</a>";
doc = Jsoup.parse(html);
doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
Element el = doc.expectFirst("a");
assertEquals("<a title=\"&amp;nbspc&amp;shyc I'm &amp;notit; I tell you. I'm &notin; I tell you.\">One</a>", el.outerHtml());
assertEquals("&nbspc&shyc I'm &notit; I tell you. I'm ∉ I tell you.", el.attr("title"));
}

@Test public void handlesXmlDeclarationAsBogusComment() {
String html = "<?xml encoding='UTF-8' ?><body>One</body>";
Document doc = Jsoup.parse(html);
Expand Down

0 comments on commit 5ee376b

Please sign in to comment.