From 7a04fdd0fac9d25225183624d1306c3072e877cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Thu, 17 Nov 2022 01:02:04 +0100 Subject: [PATCH] Refactor libXML error handling to remove global state (#12663) libXML error handlers are now set right before calling a lib function collecting errors directly in that context. Afterwards the error handler is reset. There is no longer a global array collecting errors at `XML::Error.errors`. Co-authored-by: Sijawusz Pur Rahnama Co-authored-by: Beta Ziliani --- spec/std/xml/reader_spec.cr | 20 ++++++++++++ spec/std/xml/xml_spec.cr | 9 ++---- src/xml.cr | 21 +++++++------ src/xml/error.cr | 62 +++++++++++++++++++++++-------------- src/xml/node.cr | 12 ++----- src/xml/reader.cr | 38 ++++++++++++++++++++--- src/xml/xpath_context.cr | 4 ++- 7 files changed, 110 insertions(+), 56 deletions(-) diff --git a/spec/std/xml/reader_spec.cr b/spec/std/xml/reader_spec.cr index d8c2946a201b..54183dd8742d 100644 --- a/spec/std/xml/reader_spec.cr +++ b/spec/std/xml/reader_spec.cr @@ -553,4 +553,24 @@ module XML end end end + + describe "#errors" do + it "makes errors accessible" do + reader = XML::Reader.new(%()) + reader.read + reader.expand? + + reader.errors.map(&.to_s).should eq ["Opening and ending tag mismatch: people line 1 and foo"] + end + + it "adds errors to `XML::Error.errors` (deprecated)" do + XML::Error.errors # clear class error list + + reader = XML::Reader.new(%()) + reader.read + reader.expand? + + XML::Error.errors.try(&.map(&.to_s)).should eq ["Opening and ending tag mismatch: people line 1 and foo"] + end + end end diff --git a/spec/std/xml/xml_spec.cr b/spec/std/xml/xml_spec.cr index cb5db774e32f..4c5e18c97249 100644 --- a/spec/std/xml/xml_spec.cr +++ b/spec/std/xml/xml_spec.cr @@ -158,15 +158,10 @@ describe XML do person2.previous_element.should eq(person) end - it "handles errors" do + it "#errors" do xml = XML.parse(%()) xml.root.not_nil!.name.should eq("people") - errors = xml.errors.not_nil! - errors.size.should eq(1) - - errors[0].message.should eq("Opening and ending tag mismatch: people line 1 and foo") - errors[0].line_number.should eq(1) - errors[0].to_s.should eq("Opening and ending tag mismatch: people line 1 and foo") + xml.errors.try(&.map(&.to_s)).should eq ["Opening and ending tag mismatch: people line 1 and foo"] end describe "#namespace" do diff --git a/src/xml.cr b/src/xml.cr index f07bb6b5bee0..9e1a19e4cc7c 100644 --- a/src/xml.cr +++ b/src/xml.cr @@ -52,14 +52,14 @@ module XML # See `ParserOptions.default` for default options. def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Node raise XML::Error.new("Document is empty", 0) if string.empty? - from_ptr LibXML.xmlReadMemory(string, string.bytesize, nil, nil, options) + from_ptr { LibXML.xmlReadMemory(string, string.bytesize, nil, nil, options) } end # Parses an XML document from *io* with *options* into an `XML::Node`. # # See `ParserOptions.default` for default options. def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Node - from_ptr LibXML.xmlReadIO( + from_ptr { LibXML.xmlReadIO( ->(ctx, buffer, len) { LibC::Int.new(Box(IO).unbox(ctx).read Slice.new(buffer, len)) }, @@ -68,7 +68,7 @@ module XML nil, nil, options, - ) + ) } end # Parses an HTML document from *string* with *options* into an `XML::Node`. @@ -76,14 +76,14 @@ module XML # See `HTMLParserOptions.default` for default options. def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Node raise XML::Error.new("Document is empty", 0) if string.empty? - from_ptr LibXML.htmlReadMemory(string, string.bytesize, nil, nil, options) + from_ptr { LibXML.htmlReadMemory(string, string.bytesize, nil, nil, options) } end # Parses an HTML document from *io* with *options* into an `XML::Node`. # # See `HTMLParserOptions.default` for default options. def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Node - from_ptr LibXML.htmlReadIO( + from_ptr { LibXML.htmlReadIO( ->(ctx, buffer, len) { LibC::Int.new(Box(IO).unbox(ctx).read Slice.new(buffer, len)) }, @@ -92,15 +92,16 @@ module XML nil, nil, options, - ) + ) } end - protected def self.from_ptr(doc : LibXML::Doc*) + protected def self.from_ptr(& : -> LibXML::Doc*) + errors = [] of XML::Error + doc = XML::Error.collect(errors) { yield } + raise Error.new(LibXML.xmlGetLastError) unless doc - node = Node.new(doc) - XML::Error.set_errors(node) - node + Node.new(doc, errors) end end diff --git a/src/xml/error.cr b/src/xml/error.cr index 23b0229858df..868dfeb4bd00 100644 --- a/src/xml/error.cr +++ b/src/xml/error.cr @@ -11,35 +11,14 @@ class XML::Error < Exception super(message) end - # TODO: this logic isn't thread/fiber safe, but error checking is less needed than - # the ability to parse HTML5 and malformed documents. In any case, fix this. @@errors = [] of self - LibXML.xmlSetStructuredErrorFunc nil, ->(ctx, error) { - @@errors << XML::Error.new(error) - } - - LibXML.xmlSetGenericErrorFunc nil, ->(ctx, fmt) { - # TODO: use va_start and va_end to - message = String.new(fmt).chomp - error = XML::Error.new(message, 0) - - {% if flag?(:arm) || flag?(:aarch64) %} - # libxml2 is likely missing ARM unwind tables (.ARM.extab and .ARM.exidx - # sections) which prevent raising from a libxml2 context. - @@errors << error - {% else %} - raise error - {% end %} - } - # :nodoc: - def self.set_errors(node) - if errors = self.errors - node.errors = errors - end + protected def self.add_errors(errors) + @@errors.concat(errors) end + @[Deprecated("This class accessor is deprecated. XML errors are accessible directly in the respective context via `XML::Reader#errors` and `XML::Node#errors`.")] def self.errors : Array(XML::Error)? if @@errors.empty? nil @@ -49,4 +28,39 @@ class XML::Error < Exception errors end end + + def self.collect(errors, &) + LibXML.xmlSetStructuredErrorFunc Box.box(errors), ->(ctx, error) { + Box(Array(XML::Error)).unbox(ctx) << XML::Error.new(error) + } + begin + yield + ensure + LibXML.xmlSetStructuredErrorFunc nil, nil + end + end + + def self.collect_generic(errors, &) + LibXML.xmlSetGenericErrorFunc Box.box(errors), ->(ctx, fmt) { + # TODO: use va_start and va_end to + message = String.new(fmt).chomp + error = XML::Error.new(message, 0) + + {% if flag?(:arm) || flag?(:aarch64) %} + # libxml2 is likely missing ARM unwind tables (.ARM.extab and .ARM.exidx + # sections) which prevent raising from a libxml2 context. + Box(Array(XML::Error)).unbox(ctx) << error + {% else %} + raise error + {% end %} + } + + begin + collect(errors) do + yield + end + ensure + LibXML.xmlSetGenericErrorFunc nil, nil + end + end end diff --git a/src/xml/node.cr b/src/xml/node.cr index cb5313f3e702..218897b5d025 100644 --- a/src/xml/node.cr +++ b/src/xml/node.cr @@ -7,7 +7,7 @@ class XML::Node end # :ditto: - def initialize(node : LibXML::Doc*) + def initialize(node : LibXML::Doc*, @errors = nil) initialize(node.as(LibXML::Node*)) end @@ -576,17 +576,9 @@ class XML::Node xpath(path, namespaces, variables).as(String) end - # :nodoc: - def errors=(errors) - @node.value._private = errors.as(Void*) - end - # Returns the list of `XML::Error` found when parsing this document. # Returns `nil` if no errors were found. - def errors : Array(XML::Error)? - ptr = @node.value._private - ptr ? (ptr.as(Array(XML::Error))) : nil - end + getter errors : Array(XML::Error)? private def check_no_null_byte(string) if string.includes? Char::ZERO diff --git a/src/xml/reader.cr b/src/xml/reader.cr index b6877f3c6030..684d3ed28871 100644 --- a/src/xml/reader.cr +++ b/src/xml/reader.cr @@ -1,7 +1,31 @@ require "./libxml2" require "./parser_options" +# `XML::Reader` is a parser for XML that iterates a XML document. +# +# ``` +# require "xml" +# +# reader = XML::Reader.new(<<-XML) +# Hello XML! +# XML +# reader.read +# reader.name # => "message" +# reader.read +# reader.value # => "Hello XML!" +# ``` +# +# This is an alternative approach to `XML.parse` which parses an entire document +# into an XML data structure. +# `XML::Reader` offers more control and does not need to store the XML document +# in memory entirely. The latter is especially useful for large documents with +# the `IO`-based constructor. +# +# WARNING: This type is not concurrency-safe. class XML::Reader + # Returns the errors reported while parsing. + getter errors = [] of XML::Error + # Creates a new reader from a string. # # See `XML::ParserOptions.default` for default options. @@ -30,7 +54,7 @@ class XML::Reader # Moves the reader to the next node. def read : Bool - LibXML.xmlTextReaderRead(@reader) == 1 + collect_errors { LibXML.xmlTextReaderRead(@reader) == 1 } end # Moves the reader to the next node while skipping subtrees. @@ -46,7 +70,7 @@ class XML::Reader if result == -1 node = LibXML.xmlTextReaderCurrentNode(@reader) if node.null? - LibXML.xmlTextReaderRead(@reader) == 1 + collect_errors { LibXML.xmlTextReaderRead(@reader) == 1 } elsif !node.value.next.null? LibXML.xmlTextReaderNext(@reader) == 1 else @@ -123,7 +147,7 @@ class XML::Reader # Returns the node's XML content including subtrees. def read_inner_xml : String - xml = LibXML.xmlTextReaderReadInnerXml(@reader) + xml = collect_errors { LibXML.xmlTextReaderReadInnerXml(@reader) } xml ? String.new(xml) : "" end @@ -139,7 +163,7 @@ class XML::Reader # to avoid doing an extra C call each time. return "" if node_type.none? - xml = LibXML.xmlTextReaderReadOuterXml(@reader) + xml = collect_errors { LibXML.xmlTextReaderReadOuterXml(@reader) } xml ? String.new(xml) : "" end @@ -170,4 +194,10 @@ class XML::Reader def to_unsafe @reader end + + private def collect_errors + Error.collect(@errors) { yield }.tap do + Error.add_errors(@errors) + end + end end diff --git a/src/xml/xpath_context.cr b/src/xml/xpath_context.cr index dc96887a39db..33ae9335c742 100644 --- a/src/xml/xpath_context.cr +++ b/src/xml/xpath_context.cr @@ -1,11 +1,13 @@ class XML::XPathContext + getter errors = [] of XML::Error + def initialize(node : Node) @ctx = LibXML.xmlXPathNewContext(node.to_unsafe.value.doc) @ctx.value.node = node.to_unsafe end def evaluate(search_path : String) - xpath = LibXML.xmlXPathEvalExpression(search_path, self) + xpath = XML::Error.collect_generic(@errors) { LibXML.xmlXPathEvalExpression(search_path, self) } unless xpath {% if flag?(:arm) || flag?(:aarch64) %} if errors = XML::Error.errors