From 906ced9702bdd0cf20fde92f3b8ba304e9486924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn?= Date: Fri, 19 Nov 2021 23:14:03 +0000 Subject: [PATCH 1/4] HtmlState: restore FormattedMode if still in pre At the same time the assumption that the code element is formatted is not correct. They are often used at the same time though and with that comes another issue: keeping FormattedMode in elements nested into pre. --- src/Html/HtmlParser.fs | 11 +++++------ tests/FSharp.Data.Tests/HtmlParser.fs | 17 +++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs index c41fcb7a8..493b3e94a 100644 --- a/src/Html/HtmlParser.fs +++ b/src/Html/HtmlParser.fs @@ -334,14 +334,11 @@ module internal HtmlParser = x.Tokens := result :: !x.Tokens member x.IsFormattedTag - with get() = - match x.CurrentTagName() with - | "pre" | "code" -> true - | _ -> false + with get() = x.CurrentTagName().ToLower() = "pre" member x.IsScriptTag with get() = - match x.CurrentTagName().Trim().ToLower() with + match x.CurrentTagName().ToLower() with | "script" | "style" -> true | _ -> false @@ -386,7 +383,9 @@ module internal HtmlParser = | DocTypeMode -> DocType content | CDATAMode -> CData (content.Replace("", "")) x.Content := CharList.Empty - x.InsertionMode := DefaultMode + x.InsertionMode := + if x.IsFormattedTag then FormattedMode + else DefaultMode match result with | Text t when String.IsNullOrEmpty(t) -> () | _ -> x.Tokens := result :: !x.Tokens diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index 359cdf175..7420dae20 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -805,15 +805,16 @@ let ``Can parse pre blocks``() = result |> should equal [ "\r\n This code should be indented and\r\n have line feeds in it" ] [] -let ``Can parse code blocks``() = - let html = "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" +let ``Can parse pre blocks with char refs``() = + let html = "
let hello who =\r\n    "hello" + who
" let result = (HtmlDocument.Parse html) - |> HtmlDocument.descendantsNamed true [ "code" ] - |> Seq.map (HtmlNode.innerText) - |> Seq.toList - result |> should equal [ "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" ] + |> HtmlDocument.descendantsNamed true [ "pre" ] + |> Seq.head + |> HtmlNode.innerText + let expected = "let hello who =\r\n \"hello\" + who" + result |> should equal expected [] let ``Can parse national rail mobile site correctly``() = @@ -911,9 +912,9 @@ let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() [] let ``Can handle incomplete tags at end of file without creating an infinite loop``() = let result = HtmlDocument.Parse """ should equal expected \ No newline at end of file + result |> should equal expected From 978a362bf2061e34981176befd89b6c603bbb531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn?= Date: Sat, 20 Nov 2021 14:56:24 +0000 Subject: [PATCH 2/4] HtmlState: restore code element treated as formatted, fixup test --- src/Html/HtmlParser.fs | 5 ++++- tests/FSharp.Data.Tests/HtmlParser.fs | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs index 493b3e94a..87c53edcb 100644 --- a/src/Html/HtmlParser.fs +++ b/src/Html/HtmlParser.fs @@ -334,7 +334,10 @@ module internal HtmlParser = x.Tokens := result :: !x.Tokens member x.IsFormattedTag - with get() = x.CurrentTagName().ToLower() = "pre" + with get() = + match x.CurrentTagName().ToLower() with + | "pre" | "code" -> true + | _ -> false member x.IsScriptTag with get() = diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index 7420dae20..fafed6adc 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -804,16 +804,27 @@ let ``Can parse pre blocks``() = |> Seq.toList result |> should equal [ "\r\n This code should be indented and\r\n have line feeds in it" ] +[] +let ``Can parse code blocks``() = + let html = "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" + + let result = + (HtmlDocument.Parse html) + |> HtmlDocument.descendantsNamed true [ "code" ] + |> Seq.map (HtmlNode.innerText) + |> Seq.toList + result |> should equal [ "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" ] + [] let ``Can parse pre blocks with char refs``() = - let html = "
let hello who =\r\n    "hello" + who
" + let html = "
let hello =\r\n    fun who ->\r\n        "hello" + who
" let result = (HtmlDocument.Parse html) |> HtmlDocument.descendantsNamed true [ "pre" ] |> Seq.head |> HtmlNode.innerText - let expected = "let hello who =\r\n \"hello\" + who" + let expected = "let hello =\r\n fun who ->\r\n \"hello\" + who" result |> should equal expected [] From 3ecc424574bdf239cc4768bd912c102fa63a9498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn?= Date: Tue, 23 Nov 2021 09:32:27 +0000 Subject: [PATCH 3/4] HtmlState: move from FormattedMode to HasFormattedParent bool --- src/Html/HtmlParser.fs | 29 ++++++++++++++++----------- tests/FSharp.Data.Tests/HtmlParser.fs | 18 +++++++++++++++-- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs index 87c53edcb..07c1ddba7 100644 --- a/src/Html/HtmlParser.fs +++ b/src/Html/HtmlParser.fs @@ -255,7 +255,6 @@ module internal HtmlParser = type InsertionMode = | DefaultMode - | FormattedMode | ScriptMode | CharRefMode | CommentMode @@ -264,7 +263,6 @@ module internal HtmlParser = override x.ToString() = match x with | DefaultMode -> "default" - | FormattedMode -> "formatted" | ScriptMode -> "script" | CharRefMode -> "charref" | CommentMode -> "comment" @@ -275,6 +273,7 @@ module internal HtmlParser = { Attributes : (CharList * CharList) list ref CurrentTag : CharList ref Content : CharList ref + HasFormattedParent: bool ref InsertionMode : InsertionMode ref Tokens : HtmlToken list ref Reader : TextReader } @@ -282,6 +281,7 @@ module internal HtmlParser = { Attributes = ref [] CurrentTag = ref CharList.Empty Content = ref CharList.Empty + HasFormattedParent = ref false InsertionMode = ref DefaultMode Tokens = ref [] Reader = reader } @@ -336,7 +336,7 @@ module internal HtmlParser = member x.IsFormattedTag with get() = match x.CurrentTagName().ToLower() with - | "pre" | "code" -> true + | "pre" -> true | _ -> false member x.IsScriptTag @@ -355,9 +355,15 @@ module internal HtmlParser = else TagEnd(name) else Tag(false, name, x.GetAttributes()) + // pre is the only default formatted tag, nested pres are not + // allowed in the spec. + if x.IsFormattedTag then + x.HasFormattedParent := not isEnd + else + x.HasFormattedParent := !x.HasFormattedParent || x.IsFormattedTag + x.InsertionMode := - if x.IsFormattedTag && (not isEnd) then FormattedMode - elif x.IsScriptTag && (not isEnd) then ScriptMode + if x.IsScriptTag && (not isEnd) then ScriptMode else DefaultMode x.CurrentTag := CharList.Empty @@ -377,18 +383,18 @@ module internal HtmlParser = let content = (!x.Content).ToString() match !x.InsertionMode with | DefaultMode -> - let normalizedContent = wsRegex.Value.Replace(content, " ") - if normalizedContent = " " then Text "" else Text normalizedContent - | FormattedMode -> content |> Text + if !x.HasFormattedParent then + Text content + else + let normalizedContent = wsRegex.Value.Replace(content, " ") + if normalizedContent = " " then Text "" else Text normalizedContent | ScriptMode -> content |> Text | CharRefMode -> content.Trim() |> HtmlCharRefs.substitute |> Text | CommentMode -> Comment content | DocTypeMode -> DocType content | CDATAMode -> CData (content.Replace("", "")) x.Content := CharList.Empty - x.InsertionMode := - if x.IsFormattedTag then FormattedMode - else DefaultMode + x.InsertionMode := DefaultMode match result with | Text t when String.IsNullOrEmpty(t) -> () | _ -> x.Tokens := result :: !x.Tokens @@ -424,7 +430,6 @@ module internal HtmlParser = match !state.InsertionMode with | DefaultMode -> state.Cons(); data state | ScriptMode -> script state; - | FormattedMode -> state.Cons(); data state | CharRefMode -> charRef state | DocTypeMode -> docType state | CommentMode -> comment state diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index fafed6adc..87cee2147 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -805,8 +805,8 @@ let ``Can parse pre blocks``() = result |> should equal [ "\r\n This code should be indented and\r\n have line feeds in it" ] [] -let ``Can parse code blocks``() = - let html = "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" +let ``Can parse pre containing code blocks``() = + let html = "
\r\n        let f a b = a * b\r\n        f 5 6 |> should equal 30
" let result = (HtmlDocument.Parse html) @@ -827,6 +827,20 @@ let ``Can parse pre blocks with char refs``() = let expected = "let hello =\r\n fun who ->\r\n \"hello\" + who" result |> should equal expected +[] +let ``Drops whitespace outside pre``() = + let html = + "
foo
    bar    
baz
" + + let result = + (HtmlDocument.Parse html) + |> HtmlDocument.descendantsNamed false [ "div" ] + |> Seq.head + |> string + // default indentation is 2 spaces + let expected = "
\n foo
    bar    
baz\n
" + result |> should equal expected + [] let ``Can parse national rail mobile site correctly``() = HtmlDocument.Load "Data/UKDepartures.html" From 0ac87b2a03b4a8dc202b884f70d7d870bf819d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn?= Date: Tue, 23 Nov 2021 16:51:59 +0000 Subject: [PATCH 4/4] HtmlState: fixup test to expect Environment.NewLine --- tests/FSharp.Data.Tests/HtmlParser.fs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index 87cee2147..31e051ba3 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -1,5 +1,6 @@ module FSharp.Data.Tests.HtmlParser +open System open System.Globalization open NUnit.Framework open FsUnit @@ -838,7 +839,8 @@ let ``Drops whitespace outside pre``() = |> Seq.head |> string // default indentation is 2 spaces - let expected = "
\n foo
    bar    
baz\n
" + let nl = Environment.NewLine + let expected = $"
%s{nl} foo
    bar    
baz%s{nl}
" result |> should equal expected []