-
Notifications
You must be signed in to change notification settings - Fork 289
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HtmlState: restore FormattedMode if still in pre after char ref #1414
Changes from all commits
906ced9
978a362
3ecc424
0ac87b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -255,7 +255,6 @@ module internal HtmlParser = | |
|
||
type InsertionMode = | ||
| DefaultMode | ||
| FormattedMode | ||
| ScriptMode | ||
| CharRefMode | ||
| CommentMode | ||
|
@@ -264,7 +263,6 @@ module internal HtmlParser = | |
override x.ToString() = | ||
match x with | ||
| DefaultMode -> "default" | ||
| FormattedMode -> "formatted" | ||
| ScriptMode -> "script" | ||
| CharRefMode -> "charref" | ||
| CommentMode -> "comment" | ||
|
@@ -275,13 +273,15 @@ module internal HtmlParser = | |
{ Attributes : (CharList * CharList) list ref | ||
CurrentTag : CharList ref | ||
Content : CharList ref | ||
HasFormattedParent: bool ref | ||
InsertionMode : InsertionMode ref | ||
Tokens : HtmlToken list ref | ||
Reader : TextReader } | ||
static member Create (reader:TextReader) = | ||
{ Attributes = ref [] | ||
CurrentTag = ref CharList.Empty | ||
Content = ref CharList.Empty | ||
HasFormattedParent = ref false | ||
InsertionMode = ref DefaultMode | ||
Tokens = ref [] | ||
Reader = reader } | ||
|
@@ -335,13 +335,13 @@ module internal HtmlParser = | |
|
||
member x.IsFormattedTag | ||
with get() = | ||
match x.CurrentTagName() with | ||
| "pre" | "code" -> true | ||
| _ -> false | ||
match x.CurrentTagName().ToLower() with | ||
| "pre" -> true | ||
| _ -> false | ||
|
||
member x.IsScriptTag | ||
with get() = | ||
match x.CurrentTagName().Trim().ToLower() with | ||
match x.CurrentTagName().ToLower() with | ||
| "script" | "style" -> true | ||
| _ -> false | ||
|
||
|
@@ -355,9 +355,15 @@ module internal HtmlParser = | |
else TagEnd(name) | ||
else Tag(false, name, x.GetAttributes()) | ||
|
||
// pre is the only default formatted tag, nested pres are not | ||
// allowed in the spec. | ||
if x.IsFormattedTag then | ||
x.HasFormattedParent := not isEnd | ||
else | ||
x.HasFormattedParent := !x.HasFormattedParent || x.IsFormattedTag | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as an example if this is
The codebase is fairly old :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, I just meant to be consistent, should I update it? |
||
|
||
x.InsertionMode := | ||
if x.IsFormattedTag && (not isEnd) then FormattedMode | ||
elif x.IsScriptTag && (not isEnd) then ScriptMode | ||
if x.IsScriptTag && (not isEnd) then ScriptMode | ||
else DefaultMode | ||
|
||
x.CurrentTag := CharList.Empty | ||
|
@@ -377,9 +383,11 @@ module internal HtmlParser = | |
let content = (!x.Content).ToString() | ||
match !x.InsertionMode with | ||
| DefaultMode -> | ||
let normalizedContent = wsRegex.Value.Replace(content, " ") | ||
if normalizedContent = " " then Text "" else Text normalizedContent | ||
| FormattedMode -> content |> Text | ||
if !x.HasFormattedParent then | ||
Text content | ||
else | ||
let normalizedContent = wsRegex.Value.Replace(content, " ") | ||
if normalizedContent = " " then Text "" else Text normalizedContent | ||
| ScriptMode -> content |> Text | ||
| CharRefMode -> content.Trim() |> HtmlCharRefs.substitute |> Text | ||
| CommentMode -> Comment content | ||
|
@@ -422,7 +430,6 @@ module internal HtmlParser = | |
match !state.InsertionMode with | ||
| DefaultMode -> state.Cons(); data state | ||
| ScriptMode -> script state; | ||
| FormattedMode -> state.Cons(); data state | ||
| CharRefMode -> charRef state | ||
| DocTypeMode -> docType state | ||
| CommentMode -> comment state | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
module FSharp.Data.Tests.HtmlParser | ||
|
||
open System | ||
open System.Globalization | ||
open NUnit.Framework | ||
open FsUnit | ||
|
@@ -805,8 +806,8 @@ let ``Can parse pre blocks``() = | |
result |> should equal [ "\r\n This code should be indented and\r\n have line feeds in it" ] | ||
|
||
[<Test>] | ||
let ``Can parse code blocks``() = | ||
let html = "<code>\r\n let f a b = a * b\r\n f 5 6 |> should equal 30</code>" | ||
let ``Can parse pre containing code blocks``() = | ||
let html = "<pre><code>\r\n let f a b = a * b\r\n f 5 6 |> should equal 30</code></pre>" | ||
|
||
let result = | ||
(HtmlDocument.Parse html) | ||
|
@@ -815,6 +816,33 @@ let ``Can parse code blocks``() = | |
|> Seq.toList | ||
result |> should equal [ "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" ] | ||
|
||
[<Test>] | ||
let ``Can parse pre blocks with char refs``() = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This removes the code blocks test - could you add that test back as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, it would test for invalid behavior though since code is an inline element. Should I restore the behavior for code anyways and open another issue for elements nested in pre? |
||
let html = "<pre>let hello =\r\n fun who ->\r\n "hello" + who</pre>" | ||
|
||
let result = | ||
(HtmlDocument.Parse html) | ||
|> HtmlDocument.descendantsNamed true [ "pre" ] | ||
|> Seq.head | ||
|> HtmlNode.innerText | ||
let expected = "let hello =\r\n fun who ->\r\n \"hello\" + who" | ||
result |> should equal expected | ||
|
||
[<Test>] | ||
let ``Drops whitespace outside pre``() = | ||
let html = | ||
"<div>foo <pre> bar </pre> baz</div>" | ||
|
||
let result = | ||
(HtmlDocument.Parse html) | ||
|> HtmlDocument.descendantsNamed false [ "div" ] | ||
|> Seq.head | ||
|> string | ||
// default indentation is 2 spaces | ||
let nl = Environment.NewLine | ||
let expected = $"<div>%s{nl} foo <pre> bar </pre> baz%s{nl}</div>" | ||
result |> should equal expected | ||
|
||
[<Test>] | ||
let ``Can parse national rail mobile site correctly``() = | ||
HtmlDocument.Load "Data/UKDepartures.html" | ||
|
@@ -911,9 +939,9 @@ let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() | |
[<Test; Timeout(2000)>] | ||
let ``Can handle incomplete tags at end of file without creating an infinite loop``() = | ||
let result = HtmlDocument.Parse """<html><head></head></html""" | ||
let expected = | ||
let expected = | ||
HtmlDocument.New | ||
[ HtmlNode.NewElement | ||
("html", | ||
[ HtmlNode.NewElement("head")])] | ||
result |> should equal expected | ||
result |> should equal expected |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An alternative is that this can be a
mutable HasFormattedParent
to avoid theref
, but I don't think that's terribly important.