Allow < in tag name state

We used to handle a `<` inside a tag name specially (ending the current tag and starting a new one), but that moved us out of spec, and the author's intent in that case is not clear-cut. Fixes #2230
jhy · Nov 23, 2024 · 0ef4b70 · 0ef4b70
1 parent 51909b1
commit 0ef4b70
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 26 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -35,7 +35,10 @@
   applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
 * When parsing named HTML entities, base entities should resolve if they are a prefix of the input token (and not in an
   attribute). [2207](https://github.com/jhy/jsoup/issues/2207)
-* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
+* Fixed incorrect tracking of source ranges for attributes merged from late-occurring elements that were implicitly
+  created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
+* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a
+  character node. [2230](https://github.com/jhy/jsoup/issues/2230)
 
 ## 1.18.1 (2024-Jul-10)
 

diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java
@@ -489,7 +489,7 @@ String consumeRawData() {
 
     String consumeTagName() {
         // '\t', '\n', '\r', '\f', ' ', '/', '>'
-        // NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
+        // NOTE: out of spec; does not stop and append on nullChar but eats
         bufferUp();
         int pos = bufPos;
         final int start = pos;
@@ -505,7 +505,6 @@ String consumeTagName() {
                 case ' ':
                 case '/':
                 case '>':
-                case '<':
                     break OUTER;
             }
             pos++;

diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -160,10 +160,6 @@ enum TokeniserState {
                 case '/':
                     t.transition(SelfClosingStartTag);
                     break;
-                case '<': // NOTE: out of spec, but clear author intent
-                    r.unconsume();
-                    t.error(this);
-                    // intended fall through to next >
                 case '>':
                     t.emitTagPending();
                     t.transition(Data);

diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1649,9 +1649,9 @@ private boolean didAddElements(String input) {
         // when the Element is created, the name got normalized to "template" and so looked like there should be a
         // template on the stack during resetInsertionMode for the select.
         // The issue was that the normalization in Tag.valueOf did a trim which the Token.Tag did not
-        Document doc = Jsoup.parse("<template\u001E<select<input<");
+        Document doc = Jsoup.parse("<template\u001E><select><input>");
         assertNotNull(doc);
-        assertEquals("<template><select></select><input>&lt;</template>",
+        assertEquals("<template><select></select><input></template>",
             TextUtil.stripNewlines(doc.head().html()));
     }
 
@@ -1924,4 +1924,19 @@ private static void assertMathNamespace(Element el) {
             TextUtil.normalizeSpaces(doc.body().html())
         );
     }
+
+    @Test void gtAfterTagClose() {
+        // https://github.com/jhy/jsoup/issues/2230
+        String html = "<div>Div</div<> <a>One<a<b>Hello</b>";
+        // this gives us an element "a<b", which is gross, but it matches the spec & browsers
+        Document doc = Jsoup.parse(html);
+        Element body = doc.body();
+        assertEquals("<div> Div <a>One<a<b> Hello </a<b></a></div>", TextUtil.normalizeSpaces(body.html()));
+
+        Elements abs = doc.getElementsByTag("a<b");
+        assertEquals(1, abs.size());
+        Element ab = abs.first();
+        assertEquals("Hello", ab.text());
+        assertEquals("a<b", ab.tag().normalName());
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/TokeniserStateTest.java b/src/test/java/org/jsoup/parser/TokeniserStateTest.java
@@ -198,13 +198,6 @@ public void testPublicAndSystemIdentifiersWithWhitespace() {
         }
     }
 
-    @Test public void handlesLessInTagThanAsNewTag() {
-        // out of spec, but clear author intent
-        String html = "<p\n<p<div id=one <span>Two";
-        Document doc = Jsoup.parse(html);
-        assertEquals("<p></p><p></p><div id=\"one\"><span>Two</span></div>", TextUtil.stripNewlines(doc.body().html()));
-    }
-
     @Test
     public void testUnconsumeAtBufferBoundary() {
         String triggeringSnippet = "<a href=\"\"foo";
@@ -250,16 +243,6 @@ public void testMalformedSelfClosingTag() {
         assertEquals(7, errorList.get(0).getPosition());
     }
 
-    @Test
-    public void testOpeningAngleBracketInTagName() {
-        String triggeringSnippet = "<html<";
-        ParseErrorList errorList = ParseErrorList.tracking(1);
-
-        Parser.parseFragment(triggeringSnippet, null, "", errorList);
-
-        assertEquals(5, errorList.get(0).getPosition());
-    }
-
     @Test
     public void rcData() {
         Document doc = Jsoup.parse("<title>One \0Two</title>");