Skip to content

Commit

Permalink
Allow < in tag name state
Browse files Browse the repository at this point in the history
We used to have specific handling for this, but that took us out of spec, and the author's intent in such input is not clear-cut.

Fixes #2230
  • Loading branch information
jhy committed Nov 23, 2024
1 parent 51909b1 commit 0ef4b70
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 26 deletions.
5 changes: 4 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
* When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly
created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of ending the tag
name there and handling the `<` separately (as a new tag open, or as a character node). [2230](https://github.com/jhy/jsoup/issues/2230)

## 1.18.1 (2024-Jul-10)

Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ String consumeRawData() {

String consumeTagName() {
// '\t', '\n', '\r', '\f', ' ', '/', '>'
// NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
// NOTE: out of spec; does not stop and append on nullChar but eats
bufferUp();
int pos = bufPos;
final int start = pos;
Expand All @@ -505,7 +505,6 @@ String consumeTagName() {
case ' ':
case '/':
case '>':
case '<':
break OUTER;
}
pos++;
Expand Down
4 changes: 0 additions & 4 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,6 @@ enum TokeniserState {
case '/':
t.transition(SelfClosingStartTag);
break;
case '<': // NOTE: out of spec, but clear author intent
r.unconsume();
t.error(this);
// intended fall through to next >
case '>':
t.emitTagPending();
t.transition(Data);
Expand Down
19 changes: 17 additions & 2 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1649,9 +1649,9 @@ private boolean didAddElements(String input) {
// when the Element is created, the name got normalized to "template" and so looked like there should be a
// template on the stack during resetInsertionMode for the select.
// The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not
Document doc = Jsoup.parse("<template\u001E<select<input<");
Document doc = Jsoup.parse("<template\u001E><select><input>");
assertNotNull(doc);
assertEquals("<template><select></select><input>&lt;</template>",
assertEquals("<template><select></select><input></template>",
TextUtil.stripNewlines(doc.head().html()));
}

Expand Down Expand Up @@ -1924,4 +1924,19 @@ private static void assertMathNamespace(Element el) {
TextUtil.normalizeSpaces(doc.body().html())
);
}

/**
 * Regression test for issue 2230: a {@code <} that appears inside a tag name is kept as part of that name, rather
 * than ending the tag.
 */
@Test void gtAfterTagClose() {
// https://github.com/jhy/jsoup/issues/2230
String html = "<div>Div</div<> <a>One<a<b>Hello</b>";
// this yields an element named "a<b", which is ugly, but it matches the spec and browsers
Document doc = Jsoup.parse(html);
Element body = doc.body();
// per the expected output, the malformed "</div<>" does not close the div: everything after it stays nested inside
assertEquals("<div> Div <a>One<a<b> Hello </a<b></a></div>", TextUtil.normalizeSpaces(body.html()));

// the element with the odd name can still be looked up by that name
Elements abs = doc.getElementsByTag("a<b");
assertEquals(1, abs.size());
Element ab = abs.first();
assertEquals("Hello", ab.text());
assertEquals("a<b", ab.tag().normalName());
}
}
17 changes: 0 additions & 17 deletions src/test/java/org/jsoup/parser/TokeniserStateTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,6 @@ public void testPublicAndSystemIdentifiersWithWhitespace() {
}
}

/**
 * Verifies that a {@code <} met while still inside an unterminated tag is handled as the start of a new tag, so each
 * tag in the input becomes its own element.
 */
@Test public void handlesLessInTagThanAsNewTag() {
// out of spec, but clear author intent
// none of the tags before "<span>" is closed with '>'; each following '<' is expected to open the next element
String html = "<p\n<p<div id=one <span>Two";
Document doc = Jsoup.parse(html);
assertEquals("<p></p><p></p><div id=\"one\"><span>Two</span></div>", TextUtil.stripNewlines(doc.body().html()));
}

@Test
public void testUnconsumeAtBufferBoundary() {
String triggeringSnippet = "<a href=\"\"foo";
Expand Down Expand Up @@ -250,16 +243,6 @@ public void testMalformedSelfClosingTag() {
assertEquals(7, errorList.get(0).getPosition());
}

/**
 * Checks that parsing the fragment {@code "<html<"} records a parse error whose position is 5, which matches the
 * zero-based index of the second {@code <} in the input.
 */
@Test
public void testOpeningAngleBracketInTagName() {
String triggeringSnippet = "<html<";
// NOTE(review): tracking(1) presumably limits the list to a single recorded error; confirm in ParseErrorList
ParseErrorList errorList = ParseErrorList.tracking(1);

// NOTE(review): the null and "" arguments are presumably the context element and base URI; confirm in Parser
Parser.parseFragment(triggeringSnippet, null, "", errorList);

// only the first recorded error is inspected
assertEquals(5, errorList.get(0).getPosition());
}

@Test
public void rcData() {
Document doc = Jsoup.parse("<title>One \0Two</title>");
Expand Down

0 comments on commit 0ef4b70

Please sign in to comment.