Skip to content

Commit

Permalink
HTML: track the form element pointer
Browse files Browse the repository at this point in the history
This is done for a (more-)correct behavior on an unmatched </form> tag.

The actual behavior is not quite right: when a form element is closed,
implied end tags are correctly generated for dd, dt, li, option,
optgroup, rb, rp, rt, rtc elements. If the element on the top of the
stack is not then the form element being closed, the specification says
to simply remove it from the stack.

This leaves the Markup.ml streaming parser unable to generate a proper
`End_element signal for it:

- If the signal is generated immediately, it will break nesting of the
signal stream, as the element on top of the stack has been opened, and
suddenly we are closing its ancestor form element.
- If the signal is not generated immediately, this will not accomplish
closing the form element, until its lingering presence triggers an
unrelated error, and it is closed haphazardly at a later point.

Note that this entire situation is a parse error to begin with, and
should not occur for correct markup.

This commit chooses to generate the signal immediately, but after first
closing all elements until the form element is reached, to preserve
nesting. This may result in early closing of (wrongly) unclosed elements
inside a form, in some error recovery scenarios.

Fixes aantron/lambdasoup#10.
  • Loading branch information
aantron committed May 8, 2017
1 parent 232c1b5 commit 0bf4f1b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 6 deletions.
46 changes: 40 additions & 6 deletions src/markup_html_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ type element =
suppress : bool;
mutable buffering : bool;
mutable is_open : bool;
mutable attributes : (name * string) list;
mutable attributes : (name * string) list;
mutable end_location : location;
mutable children : annotated_node list;
mutable parent : element}
Expand All @@ -67,7 +67,9 @@ and annotated_node = location * node
module Element :
sig
val create :
?is_html_integration_point:bool -> ?suppress:bool -> qname -> location ->
?is_html_integration_point:bool ->
?suppress:bool ->
qname -> location ->
element
val dummy : element

Expand Down Expand Up @@ -1036,6 +1038,7 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
let template_insertion_modes = Template.create () in
let frameset_ok = ref true in
let head_seen = ref false in
let form_element_pointer = ref None in

let add_character = Text.add text in

Expand Down Expand Up @@ -1173,7 +1176,8 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =

and push_and_emit
?(formatting = false) ?(acknowledge = false) ?(namespace = `HTML)
location ({Token_tag.name; attributes; self_closing} as tag) mode =
?(set_form_element_pointer = false) location
({Token_tag.name; attributes; self_closing} as tag) mode =

report_if (self_closing && not acknowledge) location (fun () ->
`Bad_token ("/>", "tag", "should not be self-closing"))
Expand Down Expand Up @@ -1204,6 +1208,9 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
in
open_elements := element_entry::!open_elements;

if set_form_element_pointer then
form_element_pointer := Some element_entry;

if formatting then
active_formatting_elements :=
Active.Element_ (element_entry, location, tag)::
Expand Down Expand Up @@ -1726,8 +1733,14 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
mode ())))

| l, `Start ({name = "form"} as t) ->
close_current_p_element l (fun () ->
push_and_emit l t mode)
if !form_element_pointer <> None &&
not @@ Stack.has open_elements "template" then
report l (`Misnested_tag ("form", "form")) !throw mode
else begin
close_current_p_element l (fun () ->
let in_template = Stack.has open_elements "template" in
push_and_emit ~set_form_element_pointer:(not in_template) l t mode)
end

| l, `Start ({name = "li"} as t) ->
frameset_ok := false;
Expand Down Expand Up @@ -1769,7 +1782,28 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
close_element_with_implied name l mode

| l, `End {name = "form"} ->
close_element_with_implied "form" l mode
if not @@ Stack.has open_elements "template" then begin
let form_element = !form_element_pointer in
form_element_pointer := None;
match form_element with
| Some element when Stack.target_in_scope open_elements element ->
pop_implied l (fun () ->
match Stack.current_element open_elements with
| Some element' when element' == element ->
pop l mode
| _ ->
report element.location (`Unmatched_start_tag "form") !throw
(fun () ->
pop_until (fun element' -> element' == element) l (fun () ->
pop l mode)))
| _ ->
report l (`Unmatched_end_tag "form") !throw mode
end
else
if not @@ Stack.in_scope open_elements "form" then
report l (`Unmatched_end_tag "form") !throw mode
else
close_element_with_implied "form" l mode

| l, `End {name = "p"} ->
(fun mode' ->
Expand Down
11 changes: 11 additions & 0 deletions test/test_html_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,17 @@ let tests = [
[ 1, 1, S (start_element "form");
1, 7, S `End_element]);

("html.parser.form.nested" >:: fun _ ->
expect ~context:(Some (`Fragment "body")) "<form><form></form>"
[ 1, 1, S (start_element "form");
1, 7, E (`Misnested_tag ("form", "form"));
1, 13, S `End_element]
);

("html.parser.form.unopened" >:: fun _ ->
expect ~context:(Some (`Fragment "body")) "</form>"
[ 1, 1, E (`Unmatched_end_tag "form")]);

("html.parser.noframes" >:: fun _ ->
expect ~context:None "<noframes>foo&amp;bar</a></noframes>"
[ 1, 1, S (start_element "noframes");
Expand Down

0 comments on commit 0bf4f1b

Please sign in to comment.