Skip to content
/ net Public
forked from golang/net

Commit

Permalink
html: Added new ParseOptionLenientSelfClosing option [MOD]
Browse files Browse the repository at this point in the history
This option slightly relaxes parsing rules about which elements can
be self-closing to fix common issues with XHTML in EPUBs generated
by XML tooling.

See pgaskin/kepubify#45 and pgaskin/kepubify#28 for a few examples.
  • Loading branch information
pgaskin committed Jan 12, 2020
1 parent 2b87421 commit bc324ea
Show file tree
Hide file tree
Showing 2 changed files with 270 additions and 0 deletions.
79 changes: 79 additions & 0 deletions html/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Modifications Copyright 2020 Patrick Gaskin.

package html

import (
Expand Down Expand Up @@ -49,6 +51,10 @@ type parser struct {
// context is the context element when parsing an HTML fragment
// (section 12.4).
context *Node
// lenientSelfClosing controls whether to allow additional non-void elements
// to be self closing (<whatever ... />). This is mainly for better
// compatibility with XHTML found in EPUBs. MOD(geek1011)
lenientSelfClosing bool
}

func (p *parser) top() *Node {
Expand Down Expand Up @@ -652,6 +658,36 @@ func inHeadIM(p *parser) bool {
p.tokenizer.NextIsNotRawText()
return true
case a.Script, a.Title:
// MOD(geek1011): Allow title to be self-closing. See the mod below
// for A elements for more details. This is often found in XHTML
// EPUBs, where generators don't add any text to the title tag
// and use an XML renderer (which then self-closes it, which is
// ignored in the HTML5 spec, and results in everything after
// becoming the title when parsed using a compliant HTML5 parser).
//
// Also allow script to be self-closing, as I've seen it in quite
// a few cases when HTML is generated using an XML encoder and
// it's a script[src].
if p.lenientSelfClosing && p.hasSelfClosingToken {
// Add the element, but immediately remove it from the stack. This
// is necessary because it would usually be removed after the raw
// text, which we're skipping.
p.addElement()
p.oe.pop()

// This doesn't actually do anything here, but we acknowledge it
// for consistency.
p.acknowledgeSelfClosingTag()

// There isn't an end tag for Title, so the tokenizer treats what
// follows as raw text (even if we don't add the element) unless we
// tell it not to.
p.tokenizer.NextIsNotRawText()

return true
}
// END MOD

p.addElement()
p.setOriginalIM()
p.im = textIM
Expand Down Expand Up @@ -975,6 +1011,31 @@ func inBodyIM(p *parser) bool {
}
}
p.reconstructActiveFormattingElements()
// MOD(geek1011): Allow A to be self-closing. THIS IS NOT SPEC-
// COMPLIANT, but won't cause any issues in basically any real-
// world case, as people don't go doing things like
// `<a href="/whatever" />link text</a>` and expect it to work (
// the spec says to ignore the self-closing on any non-void
// element). This fixes cases where people (or generators) do
// things like `<a id="aRandomID" />some more text` and expect
// it to be treated like XHTML/XML, where the self-closing would
// work.
//
// Based on the case for void elements (br, wbr, input, ...),
// but only closes them if they have the self closing token (/>)
// rather than closing them and ignoring content in all cases.
//
// This is done after reconstructActiveFormattingElements (but
// before addFormattingElement, which would be useless as there
// isn't any content in a self-closing A) to prevent breaking
// open formatting elements before the A tag.
if p.lenientSelfClosing && p.hasSelfClosingToken {
p.addElement()
p.oe.pop()
p.acknowledgeSelfClosingTag()
break
}
// END MOD
p.addFormattingElement()
case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
p.reconstructActiveFormattingElements()
Expand Down Expand Up @@ -1093,6 +1154,15 @@ func inBodyIM(p *parser) bool {
case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
// Ignore the token.
default:
// MOD(geek1011): Allow span to be self-closing. See the mod above
// for A elements for more details.
if p.lenientSelfClosing && p.tok.DataAtom == a.Span && p.hasSelfClosingToken {
p.addElement()
p.oe.pop()
p.acknowledgeSelfClosingTag()
break
}
// END MOD
p.reconstructActiveFormattingElements()
p.addElement()
}
Expand Down Expand Up @@ -2331,6 +2401,15 @@ func ParseOptionEnableScripting(enable bool) ParseOption {
}
}

// ParseOptionLenientSelfClosing returns a ParseOption that sets whether the
// parser accepts the self-closing syntax (<whatever ... />) on additional
// non-void elements. It exists mainly to cope with the XHTML commonly found
// in EPUBs. MOD(geek1011)
func ParseOptionLenientSelfClosing(enable bool) ParseOption {
	// The returned option captures enable and copies it onto the parser
	// when it is applied.
	setLenient := func(p *parser) {
		p.lenientSelfClosing = enable
	}
	return setLenient
}

// ParseWithOptions is like Parse, with options.
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
p := &parser{
Expand Down
191 changes: 191 additions & 0 deletions html/parse_geek1011_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// Copyright 2020 Patrick Gaskin.

package html

import (
"testing"
)

// TestMod_ParseLenientSelfClosing runs every input document through two
// configurations: variant A with ParseOptionLenientSelfClosing(false) and
// variant B with ParseOptionLenientSelfClosing(true). Each table row carries
// the expected rendering for both variants; the comparison itself is done by
// testModCase.Test, which is defined elsewhere in the package.
func TestMod_ParseLenientSelfClosing(t *testing.T) {
	table := []struct {
		desc        string // human-readable case name (testModCase.What)
		input       string // document handed to the parser (testModCase.Original)
		wantStrict  string // expected rendering with the option disabled (RenderedA)
		wantLenient string // expected rendering with the option enabled (RenderedB)
	}{
		// --- title ---
		{
			desc:        `Self closing title in head`,
			input:       `<!DOCTYPE html><html><head><title/></head><body><p>Test 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>&lt;/head&gt;&lt;body&gt;&lt;p&gt;Test 1&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</title></head><body></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title></title></head><body><p>Test 1</p></body></html>`,
		},
		{
			desc:        `Self closing title in head with elements around`,
			input:       `<!DOCTYPE html><html><head><base href="/"/><title/><meta charset="asd"/></head><body><p>Test 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><base href="/"/><title>&lt;meta charset=&#34;asd&#34;/&gt;&lt;/head&gt;&lt;body&gt;&lt;p&gt;Test 1&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</title></head><body></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><base href="/"/><title></title><meta charset="asd"/></head><body><p>Test 1</p></body></html>`,
		},
		{
			desc:        `Self closing title in body`,
			input:       `<!DOCTYPE html><html><head></head><body><title/><p>Test 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head></head><body><title>&lt;p&gt;Test 1&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</title></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head></head><body><title></title><p>Test 1</p></body></html>`,
		},

		// --- script ---
		{
			desc:        `Self closing script in head`,
			input:       `<!DOCTYPE html><html><head><title>Title</title><script src="script.js"/></head><body><p>Test 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title><script src="script.js"></head><body><p>Test 1</p></body></html></script></head><body></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title><script src="script.js"></script></head><body><p>Test 1</p></body></html>`,
		},
		{
			desc:        `Self closing script in head with elements around`,
			input:       `<!DOCTYPE html><html><head><title>Title</title><base href="/"/><script src="script.js"/><meta charset="asd"/></head><body><p>Test 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title><base href="/"/><script src="script.js"><meta charset="asd"/></head><body><p>Test 1</p></body></html></script></head><body></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title><base href="/"/><script src="script.js"></script><meta charset="asd"/></head><body><p>Test 1</p></body></html>`,
		},
		{
			desc:        `Self closing script in body`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><script src="script.js"/><p>Test 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><script src="script.js"><p>Test 1</p></body></html></script></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><script src="script.js"></script><p>Test 1</p></body></html>`,
		},

		// --- a ---
		{
			desc:        `Self closing a in body (simple)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <a id="test"/> 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <a id="test"> 1</a></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <a id="test"></a> 1</p></body></html>`,
		},
		{
			desc:        `Self closing a in body (multiple)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <a id="test"/><a id="test1"/> 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <a id="test"></a><a id="test1"> 1</a></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <a id="test"></a><a id="test1"></a> 1</p></body></html>`,
		},
		{
			desc:        `Self closing a in body (within text and escaped characters around multiple elements)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test &gt;&lt;<a id="test"/>&gt;&lt; 1<span>test</span></p><p>Test 2</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test &gt;&lt;<a id="test">&gt;&lt; 1<span>test</span></a></p><p><a id="test">Test 2</a></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test &gt;&lt;<a id="test"></a>&gt;&lt; 1<span>test</span></p><p>Test 2</p></body></html>`,
		},
		{
			desc:        `Self closing a in body (within text and escaped characters around multiple elements + HTML5 unclosed formatting elements)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<a id="test"/>&gt;<b>&lt; 1<span>test</span></p><p>Test 2</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<a id="test">&gt;<b>&lt; 1<span>test</span></b></a></i></p><p><i><a id="test"><b>Test 2</b></a></i></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<a id="test"></a>&gt;<b>&lt; 1<span>test</span></b></i></p><p><i><b>Test 2</b></i></p></body></html>`,
		},

		// --- span ---
		{
			desc:        `Self closing span in body (simple)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <span id="test"/> 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <span id="test"> 1</span></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <span id="test"></span> 1</p></body></html>`,
		},
		{
			desc:        `Self closing span in body (multiple)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <span id="test"/><span id="test1"/> 1</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <span id="test"><span id="test1"> 1</span></span></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test <span id="test"></span><span id="test1"></span> 1</p></body></html>`,
		},
		{
			desc:        `Self closing span in body (within text and escaped characters around multiple elements)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test &gt;&lt;<span id="test" />&gt;&lt; 1<span>test</span></p><p>Test 2</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test &gt;&lt;<span id="test">&gt;&lt; 1<span>test</span></span></p><p>Test 2</p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p>Test &gt;&lt;<span id="test"></span>&gt;&lt; 1<span>test</span></p><p>Test 2</p></body></html>`,
		},
		{
			desc:        `Self closing span in body (within text and escaped characters around multiple elements + HTML5 unclosed formatting elements)`,
			input:       `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<span id="test" />&gt;<b>&lt; 1<span>test</span></p><p>Test 2</p></body></html>`,
			wantStrict:  `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<span id="test">&gt;<b>&lt; 1<span>test</span></b></span></i></p><p><i><b>Test 2</b></i></p></body></html>`,
			wantLenient: `<!DOCTYPE html><html><head><title>Title</title></head><body><p><i>Test &gt;&lt;<span id="test"></span>&gt;<b>&lt; 1<span>test</span></b></i></p><p><i><b>Test 2</b></i></p></body></html>`,
		},
	}

	// Rows are executed in declaration order. Each one gets freshly built
	// option slices, exactly as if it had been written out by hand.
	for _, row := range table {
		testModCase{
			What:     row.desc,
			Original: row.input,

			ParseOptsA:  []ParseOption{ParseOptionLenientSelfClosing(false)},
			RenderOptsA: nil,
			RenderedA:   row.wantStrict,

			ParseOptsB:  []ParseOption{ParseOptionLenientSelfClosing(true)},
			RenderOptsB: nil,
			RenderedB:   row.wantLenient,
		}.Test(t)
	}
}

0 comments on commit bc324ea

Please sign in to comment.