From 392969ad309fea8fca96885bda9dd6b05a66546f Mon Sep 17 00:00:00 2001 From: Stephen Checkoway Date: Sun, 2 Sep 2018 18:30:10 -0400 Subject: [PATCH] Include line numbers from the parser (#55) Neither libxml2 nor Nokogiri contain an API for setting the line numbers for a node. When the libxml2 headers are available, the line numbers can be set directly in the node structure. Closes: #53 --- nokogumbo-import/CHANGELOG.md | 2 ++ nokogumbo-import/README.md | 30 ++++++++++++++++++++++ nokogumbo-import/ext/nokogumbo/nokogumbo.c | 30 +++++++++++++++++----- nokogumbo-import/test/test_nokogumbo.rb | 5 ++++ 4 files changed, 61 insertions(+), 6 deletions(-) diff --git a/nokogumbo-import/CHANGELOG.md b/nokogumbo-import/CHANGELOG.md index dffd29ae9bf..c278737018e 100644 --- a/nokogumbo-import/CHANGELOG.md +++ b/nokogumbo-import/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Added option `:max_errors` to control the maximum number of errors reported by `#errors` - Added option `:max_tree_depth` to control the maximum parse tree depth. +- Line number support via `Nokogiri::XML::Node#line` as long as Nokogumbo has + been compiled with libxml2 support. ### Changed - Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md index 1171d6a1275..13832677150 100644 --- a/nokogumbo-import/README.md +++ b/nokogumbo-import/README.md @@ -202,6 +202,36 @@ rules defined in the HTML5 specification for doing so. * Instead of returning `unknown` as the element name for unknown tags, the original tag name is returned verbatim. +# Flavors of Nokogumbo +Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up +parsing. If the libxml2 headers are not available, then Nokogumbo resorts to +using Nokogiri's Ruby API to construct the DOM tree. 
+ +Nokogiri can be configured to either use the system library version of libxml2 +or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri +will use a bundled version. + +To prevent differences between versions of libxml2, Nokogumbo will only use +libxml2 if the build process can find the exact same version used by Nokogiri. +This leads to three possibilities: + +1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will + (by default) use the same version of libxml2. +2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2 + headers are available, then Nokogumbo will (by default) use the system + version and headers. +3. Nokogiri is compiled with the system libxml2 but its headers aren't + available at build time for Nokogumbo. In this case, Nokogumbo will use the + slower Ruby API. + +Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec +rake` or to `gem install`. Using libxml2 can be prohibited by instead passing +`-- --without-libxml2`. + +Functionally, the only difference between using libxml2 or not is in the +behavior of `Nokogiri::XML::Node#line`. If libxml2 is used, then `#line` will +return the line number of the corresponding node. Otherwise, it will return 0. + # Installation git clone https://github.com/rubys/nokogumbo.git diff --git a/nokogumbo-import/ext/nokogumbo/nokogumbo.c b/nokogumbo-import/ext/nokogumbo/nokogumbo.c index 5da48062ca3..9635d7f6c4c 100644 --- a/nokogumbo-import/ext/nokogumbo/nokogumbo.c +++ b/nokogumbo-import/ext/nokogumbo/nokogumbo.c @@ -290,6 +290,17 @@ static xmlNsPtr lookup_or_add_ns ( #endif } +static void set_line(xmlNodePtr node, size_t line) { +#if NGLIB + // libxml2 uses 65535 to mean look elsewhere for the line number on some + // nodes. + if (line < 65535) + node->line = (unsigned short)line; +#else + // XXX: If Nokogiri gets a `#line=` method, we'll use that. 
+#endif +} + // Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted // at gumbo_node. static void build_tree ( @@ -323,6 +334,7 @@ static void build_tree ( continue; } const GumboNode *gumbo_child = children->data[child_index++]; + xmlNodePtr xml_child; switch (gumbo_child->type) { case GUMBO_NODE_DOCUMENT: @@ -330,24 +342,30 @@ static void build_tree ( case GUMBO_NODE_TEXT: case GUMBO_NODE_WHITESPACE: - xmlAddChild(xml_node, xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text)); + xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text); + set_line(xml_child, gumbo_child->v.text.start_pos.line); + xmlAddChild(xml_node, xml_child); break; case GUMBO_NODE_CDATA: - xmlAddChild(xml_node, - xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text, - (int) strlen(gumbo_child->v.text.text))); + xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text, + (int) strlen(gumbo_child->v.text.text)); + set_line(xml_child, gumbo_child->v.text.start_pos.line); + xmlAddChild(xml_node, xml_child); break; case GUMBO_NODE_COMMENT: - xmlAddChild(xml_node, xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text)); + xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text); + set_line(xml_child, gumbo_child->v.text.start_pos.line); + xmlAddChild(xml_node, xml_child); break; case GUMBO_NODE_TEMPLATE: // XXX: Should create a template element and a new DocumentFragment case GUMBO_NODE_ELEMENT: { - xmlNodePtr xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL); + xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL); + set_line(xml_child, gumbo_child->v.element.start_pos.line); if (xml_root == NIL) xml_root = xml_child; xmlNsPtr ns = NIL; diff --git a/nokogumbo-import/test/test_nokogumbo.rb b/nokogumbo-import/test/test_nokogumbo.rb index 23b0ead5ebd..e21942abafe 100644 --- a/nokogumbo-import/test/test_nokogumbo.rb +++ b/nokogumbo-import/test/test_nokogumbo.rb @@ -220,6 +220,11 @@ def 
test_document_encoding assert_equal "Кирилические символы", doc.at('body').text.gsub(/\n\s+/,'') end + def test_line_numbers + doc = Nokogiri::HTML5(buffer) + assert_includes [0, 8], doc.at('h1').line + assert_includes [0, 10], doc.at('span').line + end private