diff --git a/html/parse.go b/html/parse.go index 2cd12fc816..e5942e1ba2 100644 --- a/html/parse.go +++ b/html/parse.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// Modifications Copyright 2020 Patrick Gaskin. + package html import ( @@ -49,6 +51,10 @@ type parser struct { // context is the context element when parsing an HTML fragment // (section 12.4). context *Node + // lenientSelfClosing controls whether to allow additional non-void elements + // to be self closing (`<title/>`, `<script/>`, `<a/>`, `<span/>`). This is mainly for better + // compatibility with XHTML found in EPUBs. MOD(geek1011) + lenientSelfClosing bool } func (p *parser) top() *Node { @@ -652,6 +658,36 @@ func inHeadIM(p *parser) bool { p.tokenizer.NextIsNotRawText() return true case a.Script, a.Title: + // MOD(geek1011): Allow title to be self-closing. See the mod below + // for A elements for more details. This is often found in XHTML + // EPUBs, where generators don't add any text to the title tag + // and use an XML renderer (which then self-closes it, which is + // ignored in the HTML5 spec, and results in everything after + // becoming the title when parsed using a compliant HTML5 parser). + // + // Also allow script to be self-closing, as I've seen it in quite + // a few cases when HTML is generated using an XML encoder and + // it's a script[src]. + if p.lenientSelfClosing && p.hasSelfClosingToken { + // Add the element, but immediately remove it from the stack ( + // this is necessary because it would usually be removed after + // the raw text, which we're skipping). + p.addElement() + p.oe.pop() + + // This doesn't actually do anything here, but we acknowledge it + // for consistency. + p.acknowledgeSelfClosingTag() + + // There isn't an end tag for Title, so it considers it raw text + // afterwards (even if we don't add the element) unless we tell + // it not to consider it as such. 
+ p.tokenizer.NextIsNotRawText() + + return true + } + // END MOD + p.addElement() p.setOriginalIM() p.im = textIM @@ -975,6 +1011,31 @@ func inBodyIM(p *parser) bool { } } p.reconstructActiveFormattingElements() + // MOD(geek1011): Allow A to be self-closing. THIS IS NOT SPEC- + // COMPLIANT, but won't cause any issues in basically any real- + // world case, as people don't go doing things like + // `<a href="..." />link text</a>` and expect it to work ( + // the spec says to ignore the self-closing on any non-void + // element). This fixes cases where people (or generators) do + // things like `<a id="..." />some more text` and expect + // it to be treated like XHTML/XML, where the self-closing would + // work. + // + // Based on the case for void elements (br, wbr, input, ...), + // but only closes them if they have the self closing token (/>) + // rather than closing them and ignoring content in all cases. + // + // This is done after reconstructActiveFormattingElements (but + // before addFormattingElement, which would be useless as there + // isn't any content in a self-closing A) to prevent breaking + // open formatting elements before the A tag. + if p.lenientSelfClosing && p.hasSelfClosingToken { + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + break + } + // END MOD p.addFormattingElement() case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: p.reconstructActiveFormattingElements() @@ -1093,6 +1154,15 @@ func inBodyIM(p *parser) bool { case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: // Ignore the token. default: + // MOD(geek1011): Allow span to be self-closing. See the mod above + // for A elements for more details. 
+ if p.lenientSelfClosing && p.tok.DataAtom == a.Span && p.hasSelfClosingToken { + p.addElement() + p.oe.pop() + p.acknowledgeSelfClosingTag() + break + } + // END MOD p.reconstructActiveFormattingElements() p.addElement() } @@ -2331,6 +2401,15 @@ func ParseOptionEnableScripting(enable bool) ParseOption { } } +// ParseOptionLenientSelfClosing controls whether to allow additional non-void +// elements to be self closing (`<title/>`, `<script/>`, `<a/>`, `<span/>`). This is mainly for better +// compatibility with XHTML found in EPUBs. MOD(geek1011) +func ParseOptionLenientSelfClosing(enable bool) ParseOption { + return func(p *parser) { + p.lenientSelfClosing = enable + } +} + // ParseWithOptions is like Parse, with options. func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { p := &parser{ diff --git a/html/parse_geek1011_test.go b/html/parse_geek1011_test.go new file mode 100644 index 0000000000..ab9018786c --- /dev/null +++ b/html/parse_geek1011_test.go @@ -0,0 +1,191 @@ +// Copyright 2020 Patrick Gaskin. + +package html + +import ( + "testing" +) + +func TestMod_ParseLenientSelfClosing(t *testing.T) { + testModCase{ + What: `Self closing title in head`, + Original: `</head><body><p>Test 1</p></body></html>`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `<!DOCTYPE html><html><head><title></head><body><p>Test 1</p></body></html>`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing title in head with elements around`, + Original: `<meta charset="asd"/></head><body><p>Test 1</p></body></html>`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `<!DOCTYPE html><html><head><base href="/"/><title><meta charset="asd"/></head><body><p>Test 1</p></body></html>`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing title in body`, + Original: `<p>Test 1</p></body></html>`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `<!DOCTYPE html><html><head></head><body><title><p>Test 1</p></body></html>`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing script in head`, + Original: `Title`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing script in head with elements around`, + Original: `Title`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing script in body`, + Original: `Title`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing a in body (simple)`, + Original: `Title

Test 1

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test 1

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing a in body (multiple)`, + Original: `Title

Test 1

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test 1

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing a in body (within text and escaped characters around multiple elements)`, + Original: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test ><>< 1test

Test 2

`, + }.Test(t) + + testModCase{ + What: `Self closing a in body (within text and escaped characters around multiple elements + HTML5 unclosed formatting elements)`, + Original: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test ><>< 1test

Test 2

`, + }.Test(t) + + testModCase{ + What: `Self closing span in body (simple)`, + Original: `Title

Test 1

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test 1

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing span in body (multiple)`, + Original: `Title

Test 1

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test 1

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test 1

`, + }.Test(t) + + testModCase{ + What: `Self closing span in body (within text and escaped characters around multiple elements)`, + Original: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test ><>< 1test

Test 2

`, + }.Test(t) + + testModCase{ + What: `Self closing span in body (within text and escaped characters around multiple elements + HTML5 unclosed formatting elements)`, + Original: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsA: []ParseOption{ParseOptionLenientSelfClosing(false)}, + RenderOptsA: nil, + RenderedA: `Title

Test ><>< 1test

Test 2

`, + + ParseOptsB: []ParseOption{ParseOptionLenientSelfClosing(true)}, + RenderOptsB: nil, + RenderedB: `Title

Test ><>< 1test

Test 2

`, + }.Test(t) +}