diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs index c41fcb7a8..07c1ddba7 100644 --- a/src/Html/HtmlParser.fs +++ b/src/Html/HtmlParser.fs @@ -255,7 +255,6 @@ module internal HtmlParser = type InsertionMode = | DefaultMode - | FormattedMode | ScriptMode | CharRefMode | CommentMode @@ -264,7 +263,6 @@ module internal HtmlParser = override x.ToString() = match x with | DefaultMode -> "default" - | FormattedMode -> "formatted" | ScriptMode -> "script" | CharRefMode -> "charref" | CommentMode -> "comment" @@ -275,6 +273,7 @@ module internal HtmlParser = { Attributes : (CharList * CharList) list ref CurrentTag : CharList ref Content : CharList ref + HasFormattedParent: bool ref InsertionMode : InsertionMode ref Tokens : HtmlToken list ref Reader : TextReader } @@ -282,6 +281,7 @@ module internal HtmlParser = { Attributes = ref [] CurrentTag = ref CharList.Empty Content = ref CharList.Empty + HasFormattedParent = ref false InsertionMode = ref DefaultMode Tokens = ref [] Reader = reader } @@ -335,13 +335,13 @@ module internal HtmlParser = member x.IsFormattedTag with get() = - match x.CurrentTagName() with - | "pre" | "code" -> true - | _ -> false + match x.CurrentTagName().ToLower() with + | "pre" -> true + | _ -> false member x.IsScriptTag with get() = - match x.CurrentTagName().Trim().ToLower() with + match x.CurrentTagName().ToLower() with | "script" | "style" -> true | _ -> false @@ -355,9 +355,15 @@ module internal HtmlParser = else TagEnd(name) else Tag(false, name, x.GetAttributes()) + // pre is the only default formatted tag, nested pres are not + // allowed in the spec. + if x.IsFormattedTag then + x.HasFormattedParent := not isEnd + else + x.HasFormattedParent := !x.HasFormattedParent || x.IsFormattedTag + x.InsertionMode := - if x.IsFormattedTag && (not isEnd) then FormattedMode - elif x.IsScriptTag && (not isEnd) then ScriptMode + if x.IsScriptTag && (not isEnd) then ScriptMode else DefaultMode x.CurrentTag := CharList.Empty @@ -377,9 +383,11 @@ module internal HtmlParser = let content = (!x.Content).ToString() match !x.InsertionMode with | DefaultMode -> - let normalizedContent = wsRegex.Value.Replace(content, " ") - if normalizedContent = " " then Text "" else Text normalizedContent - | FormattedMode -> content |> Text + if !x.HasFormattedParent then + Text content + else + let normalizedContent = wsRegex.Value.Replace(content, " ") + if normalizedContent = " " then Text "" else Text normalizedContent | ScriptMode -> content |> Text | CharRefMode -> content.Trim() |> HtmlCharRefs.substitute |> Text | CommentMode -> Comment content @@ -422,7 +430,6 @@ module internal HtmlParser = match !state.InsertionMode with | DefaultMode -> state.Cons(); data state | ScriptMode -> script state; - | FormattedMode -> state.Cons(); data state | CharRefMode -> charRef state | DocTypeMode -> docType state | CommentMode -> comment state diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index 359cdf175..31e051ba3 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -1,5 +1,6 @@ module FSharp.Data.Tests.HtmlParser +open System open System.Globalization open NUnit.Framework open FsUnit @@ -805,8 +806,8 @@ let ``Can parse pre blocks``() = result |> should equal [ "\r\n This code should be indented and\r\n have line feeds in it" ] [] -let ``Can parse code blocks``() = - let html = "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" +let ``Can parse pre containing code blocks``() = + let html = "
\r\n        let f a b = a * b\r\n        f 5 6 |> should equal 30
" let result = (HtmlDocument.Parse html) @@ -815,6 +816,33 @@ let ``Can parse code blocks``() = |> Seq.toList result |> should equal [ "\r\n let f a b = a * b\r\n f 5 6 |> should equal 30" ] +[] +let ``Can parse pre blocks with char refs``() = + let html = "
let hello =\r\n    fun who ->\r\n        "hello" + who
" + + let result = + (HtmlDocument.Parse html) + |> HtmlDocument.descendantsNamed true [ "pre" ] + |> Seq.head + |> HtmlNode.innerText + let expected = "let hello =\r\n fun who ->\r\n \"hello\" + who" + result |> should equal expected + +[] +let ``Drops whitespace outside pre``() = + let html = + "
foo
    bar    
baz
" + + let result = + (HtmlDocument.Parse html) + |> HtmlDocument.descendantsNamed false [ "div" ] + |> Seq.head + |> string + // default indentation is 2 spaces + let nl = Environment.NewLine + let expected = $"
%s{nl} foo
    bar    
baz%s{nl}
" + result |> should equal expected + [] let ``Can parse national rail mobile site correctly``() = HtmlDocument.Load "Data/UKDepartures.html" @@ -911,9 +939,9 @@ let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() [] let ``Can handle incomplete tags at end of file without creating an infinite loop``() = let result = HtmlDocument.Parse """ should equal expected \ No newline at end of file + result |> should equal expected